bigdatagenomics · jstjohn · Apr 24, 2015 · Apr 24, 2015 · Apr 24, 2015 · May 13, 2015
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModel.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModel.scala
@@ -0,0 +1,122 @@
+/*
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.bdgenomics.avocado.algorithms.mutect
+
+import org.bdgenomics.avocado.models.AlleleObservation
+import scala.math._
+
+trait LikelihoodModel {
+  def logLikelihood(ref: String,
+                    alt: String,
+                    obs: Iterable[AlleleObservation],
+                    f: Option[Double]): Double
+}
+
+case class LogOdds(m1: LikelihoodModel, m2: LikelihoodModel) {
+
+  def logOdds(ref: String, alt: String,
+              obs: Iterable[AlleleObservation],
+              f: Option[Double]): Double =
+    m1.logLikelihood(ref, alt, obs, f) - m2.logLikelihood(ref, alt, obs, f)
+}
+
+object MutectLogOdds extends LogOdds(MfmModel, M0Model) {
+}
+
+/**
+ * Use for the log odds that a normal is not a heterozygous site
+ */
+object MutectSomaticLogOdds extends LogOdds(M0Model, MHModel) {
+}
+
+object M0Model extends LikelihoodModel {
+
+  override def logLikelihood(ref: String,
+                             alt: String,
+                             obs: Iterable[AlleleObservation],
+                             f: Option[Double]): Double =
+    MfmModel.logLikelihood(ref, alt, obs, Some(0.0))
+}
+
+/**
+ * M_{m, 0.5} -- probability of a heterozygous site
+ */
+object MHModel extends LikelihoodModel {
+
+  override def logLikelihood(ref: String,
+                             alt: String,
+                             obs: Iterable[AlleleObservation],
+                             f: Option[Double]): Double =
+    MfmModel.logLikelihood(ref, alt, obs, Some(0.5))
+}
+
+/**
+ * M_{m, f}
+ */
+object MfmModel extends LikelihoodModel {
+
+  def e(q: Int): Double = pow(10.0, -0.1 * q.toDouble)
+
+  def P_bi(obs: AlleleObservation, r: String, m: String, f: Double): Double = {
+    val ei = e(obs.phred)
+    if (obs.allele == r) {
+      f * (ei / 3.0) + (1.0 - f) * (1.0 - ei)
+    } else if (obs.allele == m) {
+      f * (1.0 - ei) + (1.0 - f) * (ei / 3.0)
+    } else {
+      ei / 3.0
+    }
+  }
+
+  override def logLikelihood(ref: String, alt: String,
+                             obs: Iterable[AlleleObservation],
+                             f: Option[Double]): Double = {
+    val fEstimate: Double = f.getOrElse(obs.count(_.allele == alt).toDouble / obs.size)
+    obs.map { ob => log10(P_bi(ob, ref, alt, fEstimate)) }.sum
+  }
+}
+
+/**
+ * This is just a stupid stand-in -- estimate an allelic fraction and then compute
+ * a binomial likelihood
+ */
+object BinomialModel extends LikelihoodModel {
+  private val log2Pi = log10(2.0 * Pi)
+
+  private def log10BinomialCoefficient(n: Int, m: Int): Double = {
+    val logN = log10(n)
+    val logM = log10(m)
+    val logNminusM = log10(n - m)
+    (n + 0.5) * logN - (m + 0.5) * logM - (n - m + 0.5) * logNminusM - 0.5 * log2Pi
+  }
+
+  private def log10BinomialLikelihood(p: Double, n: Int, k: Int): Double = {
+    log10BinomialCoefficient(n, k) + k * log10(p) + (n - k) * log10(1.0 - p)
+  }
+
+  override def logLikelihood(ref: String, alt: String,
+                             obs: Iterable[AlleleObservation],
+                             f: Option[Double]): Double = {
+    val (refObs, altObs) = obs.partition(_.allele == ref)
+    val refCount = refObs.size
+    val altCount = altObs.size
+    val p: Double = f.getOrElse(altCount.toDouble / (refCount + altCount))
+    log10BinomialLikelihood(p, refCount + altCount, altCount)
+  }
+}
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/Mutect.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/Mutect.scala
@@ -0,0 +1,101 @@
+/*
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.bdgenomics.avocado.algorithms.mutect
+
+import org.bdgenomics.adam.models.ReferencePosition
+import org.bdgenomics.avocado.models.AlleleObservation
+import org.bdgenomics.formats.avro.{ Contig, Genotype, Variant }
+
+/**
+ *
+ * @param threshold Default value of 6.3 corresponds to a 3x10-6 mutation rate,
+ *                  see the Online Methods of the Mutect paper for more details.
+ * @param somDbSnpThreshold if the site is a known dbSnp site, apply this cutoff for somatic classification
+ * @param somNovelThreshold if the site is a novel variant, apply this cutoff for somatic classification
+ *
+ */
+class Mutect(val threshold: Double = 6.3, val somDbSnpThreshold: Double = 5.5,
+             val somNovelThreshold: Double = 2.2, val f: Option[Double] = None) {
+
+  val model = MutectLogOdds
+  // TODO: do we want to do somatic classification here?
+  val somaticModel = MutectSomaticLogOdds
+  val alleles: Set[String] = Set("A", "T", "G", "C")
+
+  // TODO make sure we only consider the tumor AlleleObservations for this calculation
+  def constructVariant(pos: ReferencePosition,
+                       ref: String,
+                       alt: String,
+                       obs: Iterable[AlleleObservation]): (Variant, Seq[Genotype]) = {
+
+    val contig = Contig.newBuilder()
+      .setContigName(pos.referenceName)
+      .build()
+
+    val variant = Variant.newBuilder()
+      .setStart(pos.pos)
+      .setContig(contig)
+      .setEnd(pos.pos + ref.length)
+      .setReferenceAllele(ref)
+      .setAlternateAllele(alt)
+      .build()
+
+    val genotypes = Seq()
+
+    (variant, genotypes)
+  }
+
+  // TODO make sure we only consider the tumor AlleleObservations for this calculation
+  def detect(pos: ReferencePosition,
+             ref: String,
+             obs: Seq[AlleleObservation]): Option[(Variant, Seq[Genotype])] = {
+
+    //TODO do we want to split up normal/tumor here, or earlier in code?
+    val tumors = obs.filter(a => a.sample == "tumor")
+    val normals = obs.filter(a => a.sample == "normal")
+
+    val rankedAlts: Seq[(Double, String)] =
+      (alleles - ref).map { alt =>
+        (model.logOdds(ref, alt, obs, f), alt)
+      }.toSeq.sorted.reverse
+
+    //TODO here we should check/flag if there are multiple variants that pass the threshold, this is a
+    // downstream filtering criteria we have easy access to right here.
+    val passingOddsAlts = rankedAlts.filter(oa => oa._1 >= threshold)
+
+    // TODO do we want to output any kind of info for sites where multiple passing variants are found
+    if (passingOddsAlts.size == 1) {
+      val alt = passingOddsAlts(0)._2
+      // TODO classify somatic status here? or as a downstream step?
+
+      val normalNotHet = somaticModel.logOdds(ref, alt, normals, None)
+      val dbSNPsite = false //TODO figure out if this is a dbSNP position
+
+      if ((dbSNPsite && normalNotHet >= somDbSnpThreshold) || (!dbSNPsite && normalNotHet >= somNovelThreshold)) {
+        Option(constructVariant(pos, ref, alt, obs))
+      } else {
+        None
+      }
+    } else {
+      // either there are 0 passing variants, or there are > 1 passing variants
+      None
+    }
+  }
+
+}
diff --git a/...o-core/src/test/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModelSuite.scala b/...o-core/src/test/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModelSuite.scala
@@ -0,0 +1,124 @@
+package org.bdgenomics.avocado.algorithms.mutect
+
+import org.bdgenomics.adam.models.ReferencePosition
+import org.bdgenomics.avocado.algorithms.math.MathTestUtils
+import org.bdgenomics.avocado.models.AlleleObservation
+import org.scalatest.FunSuite
+
+/**
+ * Created by john on 5/17/15.
+ */
+class LikelihoodModelSuite extends FunSuite {
+  //Demo data
+  val all_c = for (read_id <- 1 to 10) yield {
+    AlleleObservation(pos = ReferencePosition("ctg", 0L),
+      length = 1,
+      allele = "C",
+      phred = 30 + (read_id - 1), // 30 to 39
+      mapq = Some(30),
+      onNegativeStrand = read_id % 2 == 0,
+      sample = "shouldntmatter",
+      readId = 1L)
+  }
+
+  val some_muts = for (read_id <- 1 to 10) yield {
+    AlleleObservation(pos = ReferencePosition("ctg", 0L),
+      length = 1,
+      allele = if (read_id <= 3) "A" else "C", //Mutants = A x 3 w/ scores (30,31,32)
+      phred = 30 + (read_id - 1), // 30 to 39
+      mapq = Some(30),
+      onNegativeStrand = read_id % 2 == 0,
+      sample = "shouldntmatter",
+      readId = 1L)
+  }
+
+  test("Likelihood of simple model, no errors or mutants, all reference sites") {
+    val m0likelihood = M0Model.logLikelihood("C", "A", all_c, None)
+    val mflikelihood = MfmModel.logLikelihood("C", "A", all_c, None)
+    // Assuming f = 0, mflikelihood or m0likelihood
+    // in R:
+    // options(digits=10)
+    // p = seq(30,39)
+    // e = 10^(-p/10)
+    // log10(prod(1-e))
+    // > -0.001901013984
+    MathTestUtils.assertAlmostEqual(m0likelihood, -0.001901013984)
+    MathTestUtils.assertAlmostEqual(mflikelihood, m0likelihood)
+
+    val mhlikelihood = MHModel.logLikelihood("C", "A", all_c, None)
+    // assuming f = 0.5 mhlikelihood
+    // in R
+    // log10(prod(0.5*(e/3) + 0.5*(1-e)))
+    // -3.01156717
+    MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717)
+  }
+  test("Likelihood of simple model, all errors, no reference or mutant sites") {
+    val m0likelihood = M0Model.logLikelihood("T", "G", all_c, None)
+    val mflikelihood = MfmModel.logLikelihood("T", "G", all_c, None)
+    val mhlikelihood = MHModel.logLikelihood("T", "G", all_c, None)
+
+    // Assuming f = 0 or 0.5, mflikelihood or m0likelihood
+    // in R:
+    // options(digits=10)
+    // p = seq(30,39)
+    // e = 10^(-p/10)
+    // log10(prod(e/3))
+    // > -39.27121
+    MathTestUtils.assertAlmostEqual(m0likelihood, -39.27121255)
+    MathTestUtils.assertAlmostEqual(mflikelihood, m0likelihood)
+    MathTestUtils.assertAlmostEqual(mhlikelihood, m0likelihood)
+
+  }
+
+  test("Likelihood of simple model, all mutant, no error or reference sites") {
+    val m0likelihood = M0Model.logLikelihood("A", "C", all_c, None)
+    // Assuming f = 0, mflikelihood or m0likelihood
+    // in R:
+    // options(digits=10)
+    // p = seq(30,39)
+    // e = 10^(-p/10)
+    // log10(prod(e/3))
+    // > -39.27121
+    MathTestUtils.assertAlmostEqual(m0likelihood, -39.27121255)
+
+    val mhlikelihood = MHModel.logLikelihood("C", "A", all_c, None)
+    // assuming f = 0.5 mhlikelihood
+    // in R
+    // log10(prod(0.5*(1-e) + 0.5*(e/3)))
+    // -3.01156717
+    MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717)
+
+  }
+
+  test("Likelihood/log10 likelihood of small mutant, first 3 reads (30,31,32) MfModel") {
+    val mflikelihood = MfmModel.logLikelihood("C", "A", some_muts, None)
+    val m03likelihood = MfmModel.logLikelihood("C", "A", some_muts, Some(0.3))
+    val mhlikelihood = MHModel.logLikelihood("C", "A", some_muts, None)
+    val m0likelihood = M0Model.logLikelihood("C", "A", some_muts, None)
+    // f should be 0.3
+    // in R
+    // pm = seq(30,32)
+    // pr = seq(33,39)
+    // em = 10^(-pm/10)
+    // er = 10^(-pr/10)
+    // log10(prod(0.3*(1-em) + 0.7*(em/3))*prod( 0.3*(er/3) + 0.7*(1-er)))
+    // -2.653910268
+    MathTestUtils.assertAlmostEqual(mflikelihood, -2.653910268)
+    MathTestUtils.assertAlmostEqual(m03likelihood, mflikelihood) //same if f properly calculated here
+
+    // the no mutants model in R
+    // log10(prod(0*(1-em) + 1*(em/3))*prod( 0*(er/3) + 1*(1-er)))
+    // -10.73221105
+    MathTestUtils.assertAlmostEqual(m0likelihood, -10.73221105)
+    val logOddsMutant = MutectLogOdds.logOdds("C", "A", some_muts, None)
+    MathTestUtils.assertAlmostEqual(logOddsMutant, mflikelihood - m0likelihood)
+
+    // the heterozygous site model in R:
+    // log10(prod(0.5*(1-em) + 0.5*(em/3))*prod( 0.5*(er/3) + 0.5*(1-er)))
+    // -3.01156717
+    MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717)
+    val logOddsHet = MutectSomaticLogOdds.logOdds("C", "A", some_muts, None)
+    MathTestUtils.assertAlmostEqual(logOddsHet, m0likelihood - mhlikelihood)
+  }
+
+}