-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Work on getting Mutect algorithm up and running #167
base: master
Are you sure you want to change the base?
Changes from 16 commits
c34d30c
aa88c53
16da202
65c68dd
cc840b9
10181ec
3fc75b2
265b142
fe6772c
310c924
571bb2a
c0503f6
5d96d8b
c8cb648
5506148
f9fe9c2
aac84f9
cde7a00
c32ef7f
fe85550
1a92281
4d5487e
b5ac34b
2820113
af3040f
3c1511d
095c2d4
b3a3f23
504c0a8
bb2755c
c7021ee
c2aa3c5
e55d1a9
7218475
2083c8c
fc5bf4d
79e3547
27bb516
0dc1ea1
31d1f00
b034e7b
b0ee22d
03aaaf8
607ebab
0f9931c
ab9b0ca
cc088c9
fb18208
8d0c286
88a5cba
d5de796
4ed4e93
00a89c1
7b5da97
b501595
ba1b236
5b313c8
d3b16d1
2444c9a
4819062
9bb2980
0b2955e
f7ad48b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
/* | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.bdgenomics.avocado.algorithms.mutect | ||
|
||
import org.bdgenomics.avocado.models.AlleleObservation | ||
import scala.math._ | ||
|
||
trait LikelihoodModel { | ||
def logLikelihood(ref: String, | ||
alt: String, | ||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double | ||
} | ||
|
||
case class LogOdds(m1: LikelihoodModel, m2: LikelihoodModel) { | ||
|
||
def logOdds(ref: String, alt: String, | ||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double = | ||
m1.logLikelihood(ref, alt, obs, f) - m2.logLikelihood(ref, alt, obs, f) | ||
} | ||
|
||
object MutectLogOdds extends LogOdds(MfmModel, M0Model) { | ||
} | ||
|
||
/** | ||
* Use for the log odds that a normal is not a heterozygous site | ||
*/ | ||
object MutectSomaticLogOdds extends LogOdds(M0Model, MHModel) { | ||
} | ||
|
||
object M0Model extends LikelihoodModel { | ||
|
||
override def logLikelihood(ref: String, | ||
alt: String, | ||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double = | ||
MfmModel.logLikelihood(ref, alt, obs, Some(0.0)) | ||
} | ||
|
||
/** | ||
* M_{m, 0.5} -- probability of a heterozygous site | ||
*/ | ||
object MHModel extends LikelihoodModel { | ||
|
||
override def logLikelihood(ref: String, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
alt: String, | ||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double = | ||
MfmModel.logLikelihood(ref, alt, obs, Some(0.5)) | ||
} | ||
|
||
/** | ||
* M_{m, f} | ||
*/ | ||
object MfmModel extends LikelihoodModel { | ||
|
||
def e(q: Int): Double = pow(10.0, -0.1 * q.toDouble) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer to use https://github.com/bigdatagenomics/adam/blob/master/adam-core/src/main/scala/org/bdgenomics/adam/util/PhredUtils.scala#L32 which precomputes the phred-to-log conversion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Weird, for some reason when I switch from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, it should have been |
||
|
||
def P_bi(obs: AlleleObservation, r: String, m: String, f: Double): Double = { | ||
val ei = e(obs.phred) | ||
if (obs.allele == r) { | ||
f * (ei / 3.0) + (1.0 - f) * (1.0 - ei) | ||
} else if (obs.allele == m) { | ||
f * (1.0 - ei) + (1.0 - f) * (ei / 3.0) | ||
} else { | ||
ei / 3.0 | ||
} | ||
} | ||
|
||
override def logLikelihood(ref: String, alt: String, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double = { | ||
val fEstimate: Double = f.getOrElse(obs.count(_.allele == alt).toDouble / obs.size) | ||
obs.map { ob => log10(P_bi(ob, ref, alt, fEstimate)) }.sum | ||
} | ||
} | ||
|
||
/** | ||
* This is just a stupid stand-in -- estimate an allelic fraction and then compute | ||
* a binomial likelihood | ||
*/ | ||
object BinomialModel extends LikelihoodModel { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's remove the |
||
private val log2Pi = log10(2.0 * Pi) | ||
|
||
private def log10BinomialCoefficient(n: Int, m: Int): Double = { | ||
val logN = log10(n) | ||
val logM = log10(m) | ||
val logNminusM = log10(n - m) | ||
(n + 0.5) * logN - (m + 0.5) * logM - (n - m + 0.5) * logNminusM - 0.5 * log2Pi | ||
} | ||
|
||
private def log10BinomialLikelihood(p: Double, n: Int, k: Int): Double = { | ||
log10BinomialCoefficient(n, k) + k * log10(p) + (n - k) * log10(1.0 - p) | ||
} | ||
|
||
override def logLikelihood(ref: String, alt: String, | ||
obs: Iterable[AlleleObservation], | ||
f: Option[Double]): Double = { | ||
val (refObs, altObs) = obs.partition(_.allele == ref) | ||
val refCount = refObs.size | ||
val altCount = altObs.size | ||
val p: Double = f.getOrElse(altCount.toDouble / (refCount + altCount)) | ||
log10BinomialLikelihood(p, refCount + altCount, altCount) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/* | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.bdgenomics.avocado.algorithms.mutect | ||
|
||
import org.bdgenomics.adam.models.ReferencePosition | ||
import org.bdgenomics.avocado.models.AlleleObservation | ||
import org.bdgenomics.formats.avro.{ Contig, Genotype, Variant } | ||
|
||
/** | ||
* | ||
* @param threshold Default value of 6.3 corresponds to a 3x10-6 mutation rate, | ||
* see the Online Methods of the Mutect paper for more details. | ||
* @param somDbSnpThreshold if the site is a known dbSnp site, apply this cutoff for somatic classification | ||
* @param somNovelThreshold if the site is a novel variant, apply this cutoff for somatic classification | ||
* | ||
*/ | ||
class Mutect(val threshold: Double = 6.3, val somDbSnpThreshold: Double = 5.5, | ||
val somNovelThreshold: Double = 2.2, val f: Option[Double] = None) { | ||
|
||
val model = MutectLogOdds | ||
// TODO: do we want to do somatic classification here? | ||
val somaticModel = MutectSomaticLogOdds | ||
val alleles: Set[String] = Set("A", "T", "G", "C") | ||
|
||
// TODO make sure we only consider the tumor AlleleObservations for this calculation | ||
def constructVariant(pos: ReferencePosition, | ||
ref: String, | ||
alt: String, | ||
obs: Iterable[AlleleObservation]): (Variant, Seq[Genotype]) = { | ||
|
||
val contig = Contig.newBuilder() | ||
.setContigName(pos.referenceName) | ||
.build() | ||
|
||
val variant = Variant.newBuilder() | ||
.setStart(pos.pos) | ||
.setContig(contig) | ||
.setEnd(pos.pos + ref.length) | ||
.setReferenceAllele(ref) | ||
.setAlternateAllele(alt) | ||
.build() | ||
|
||
val genotypes = Seq() | ||
|
||
(variant, genotypes) | ||
} | ||
|
||
// TODO make sure we only consider the tumor AlleleObservations for this calculation | ||
def detect(pos: ReferencePosition, | ||
ref: String, | ||
obs: Seq[AlleleObservation]): Option[(Variant, Seq[Genotype])] = { | ||
|
||
//TODO do we want to split up normal/tumor here, or earlier in code? | ||
val tumors = obs.filter(a => a.sample == "tumor") | ||
val normals = obs.filter(a => a.sample == "normal") | ||
|
||
val rankedAlts: Seq[(Double, String)] = | ||
(alleles - ref).map { alt => | ||
(model.logOdds(ref, alt, obs, f), alt) | ||
}.toSeq.sorted.reverse | ||
|
||
//TODO here we should check/flag if there are multiple variants that pass the threshold, this is a | ||
// downstream filtering criteria we have easy access to right here. | ||
val passingOddsAlts = rankedAlts.filter(oa => oa._1 >= threshold) | ||
|
||
// TODO do we want to output any kind of info for sites where multiple passing variants are found | ||
if (passingOddsAlts.size == 1) { | ||
val alt = passingOddsAlts(0)._2 | ||
// TODO classify somatic status here? or as a downstream step? | ||
|
||
val normalNotHet = somaticModel.logOdds(ref, alt, normals, None) | ||
val dbSNPsite = false //TODO figure out if this is a dbSNP position | ||
|
||
if ((dbSNPsite && normalNotHet >= somDbSnpThreshold) || (!dbSNPsite && normalNotHet >= somNovelThreshold)) { | ||
Option(constructVariant(pos, ref, alt, obs)) | ||
} else { | ||
None | ||
} | ||
} else { | ||
// either there are 0 passing variants, or there are > 1 passing variants | ||
None | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package org.bdgenomics.avocado.algorithms.mutect | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add the license header to this file? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
import org.bdgenomics.adam.models.ReferencePosition | ||
import org.bdgenomics.avocado.algorithms.math.MathTestUtils | ||
import org.bdgenomics.avocado.models.AlleleObservation | ||
import org.scalatest.FunSuite | ||
|
||
/** | ||
* Created by john on 5/17/15. | ||
*/ | ||
class LikelihoodModelSuite extends FunSuite { | ||
//Demo data | ||
val all_c = for (read_id <- 1 to 10) yield { | ||
AlleleObservation(pos = ReferencePosition("ctg", 0L), | ||
length = 1, | ||
allele = "C", | ||
phred = 30 + (read_id - 1), // 30 to 39 | ||
mapq = Some(30), | ||
onNegativeStrand = read_id % 2 == 0, | ||
sample = "shouldntmatter", | ||
readId = 1L) | ||
} | ||
|
||
val some_muts = for (read_id <- 1 to 10) yield { | ||
AlleleObservation(pos = ReferencePosition("ctg", 0L), | ||
length = 1, | ||
allele = if (read_id <= 3) "A" else "C", //Mutants = A x 3 w/ scores (30,31,32) | ||
phred = 30 + (read_id - 1), // 30 to 39 | ||
mapq = Some(30), | ||
onNegativeStrand = read_id % 2 == 0, | ||
sample = "shouldntmatter", | ||
readId = 1L) | ||
} | ||
|
||
test("Likelihood of simple model, no errors or mutants, all reference sites") { | ||
val m0likelihood = M0Model.logLikelihood("C", "A", all_c, None) | ||
val mflikelihood = MfmModel.logLikelihood("C", "A", all_c, None) | ||
// Assuming f = 0, mflikelihood or m0likelihood | ||
// in R: | ||
// options(digits=10) | ||
// p = seq(30,39) | ||
// e = 10^(-p/10) | ||
// log10(prod(1-e)) | ||
// > -0.001901013984 | ||
MathTestUtils.assertAlmostEqual(m0likelihood, -0.001901013984) | ||
MathTestUtils.assertAlmostEqual(mflikelihood, m0likelihood) | ||
|
||
val mhlikelihood = MHModel.logLikelihood("C", "A", all_c, None) | ||
// assuming f = 0.5 mhlikelihood | ||
// in R | ||
// log10(prod(0.5*(e/3) + 0.5*(1-e))) | ||
// -3.01156717 | ||
MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717) | ||
} | ||
test("Likelihood of simple model, all errors, no reference or mutant sites") { | ||
val m0likelihood = M0Model.logLikelihood("T", "G", all_c, None) | ||
val mflikelihood = MfmModel.logLikelihood("T", "G", all_c, None) | ||
val mhlikelihood = MHModel.logLikelihood("T", "G", all_c, None) | ||
|
||
// Assuming f = 0 or 0.5, mflikelihood or m0likelihood | ||
// in R: | ||
// options(digits=10) | ||
// p = seq(30,39) | ||
// e = 10^(-p/10) | ||
// log10(prod(e/3)) | ||
// > -39.27121 | ||
MathTestUtils.assertAlmostEqual(m0likelihood, -39.27121255) | ||
MathTestUtils.assertAlmostEqual(mflikelihood, m0likelihood) | ||
MathTestUtils.assertAlmostEqual(mhlikelihood, m0likelihood) | ||
|
||
} | ||
|
||
test("Likelihood of simple model, all mutant, no error or reference sites") { | ||
val m0likelihood = M0Model.logLikelihood("A", "C", all_c, None) | ||
// Assuming f = 0, mflikelihood or m0likelihood | ||
// in R: | ||
// options(digits=10) | ||
// p = seq(30,39) | ||
// e = 10^(-p/10) | ||
// log10(prod(e/3)) | ||
// > -39.27121 | ||
MathTestUtils.assertAlmostEqual(m0likelihood, -39.27121255) | ||
|
||
val mhlikelihood = MHModel.logLikelihood("C", "A", all_c, None) | ||
// assuming f = 0.5 mhlikelihood | ||
// in R | ||
// log10(prod(0.5*(1-e) + 0.5*(e/3))) | ||
// -3.01156717 | ||
MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717) | ||
|
||
} | ||
|
||
test("Likelihood/log10 likelihood of small mutant, first 3 reads (30,31,32) MfModel") { | ||
val mflikelihood = MfmModel.logLikelihood("C", "A", some_muts, None) | ||
val m03likelihood = MfmModel.logLikelihood("C", "A", some_muts, Some(0.3)) | ||
val mhlikelihood = MHModel.logLikelihood("C", "A", some_muts, None) | ||
val m0likelihood = M0Model.logLikelihood("C", "A", some_muts, None) | ||
// f should be 0.3 | ||
// in R | ||
// pm = seq(30,32) | ||
// pr = seq(33,39) | ||
// em = 10^(-pm/10) | ||
// er = 10^(-pr/10) | ||
// log10(prod(0.3*(1-em) + 0.7*(em/3))*prod( 0.3*(er/3) + 0.7*(1-er))) | ||
// -2.653910268 | ||
MathTestUtils.assertAlmostEqual(mflikelihood, -2.653910268) | ||
MathTestUtils.assertAlmostEqual(m03likelihood, mflikelihood) //same if f properly calculated here | ||
|
||
// the no mutants model in R | ||
// log10(prod(0*(1-em) + 1*(em/3))*prod( 0*(er/3) + 1*(1-er))) | ||
// -10.73221105 | ||
MathTestUtils.assertAlmostEqual(m0likelihood, -10.73221105) | ||
val logOddsMutant = MutectLogOdds.logOdds("C", "A", some_muts, None) | ||
MathTestUtils.assertAlmostEqual(logOddsMutant, mflikelihood - m0likelihood) | ||
|
||
// the heterozygous site model in R: | ||
// log10(prod(0.5*(1-em) + 0.5*(em/3))*prod( 0.5*(er/3) + 0.5*(1-er))) | ||
// -3.01156717 | ||
MathTestUtils.assertAlmostEqual(mhlikelihood, -3.01156717) | ||
val logOddsHet = MutectSomaticLogOdds.logOdds("C", "A", some_muts, None) | ||
MathTestUtils.assertAlmostEqual(logOddsHet, m0likelihood - mhlikelihood) | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
override
can be removed here.