-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* perf: CLIN-3037 Split Enriched SpliceAI in indel and snv
* refactor: CLIN-3037 Move withSpliceAI from genes to variants
Laura Bégin authored Aug 21, 2024
1 parent fa52e6e, commit 94924fd
Showing 12 changed files with 245 additions and 123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 21 additions & 25 deletions
46
...spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/enriched/SpliceAiSpec.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,58 @@ | ||
package bio.ferlab.datalake.spark3.publictables.enriched | ||
|
||
import bio.ferlab.datalake.commons.config.DatasetConf | ||
import bio.ferlab.datalake.spark3.testutils.WithTestConfig | ||
import bio.ferlab.datalake.testutils.models.enriched.{EnrichedSpliceAi, MAX_SCORE} | ||
import bio.ferlab.datalake.testutils.models.normalized.NormalizedSpliceAi | ||
import bio.ferlab.datalake.spark3.testutils.WithTestConfig | ||
import bio.ferlab.datalake.testutils.{SparkSpec, TestETLContext} | ||
|
||
class SpliceAiSpec extends SparkSpec with WithTestConfig { | ||
|
||
import spark.implicits._ | ||
|
||
val job = new SpliceAi(TestETLContext()) | ||
val source: DatasetConf = conf.getDataset("normalized_spliceai_snv") | ||
val destination: DatasetConf = conf.getDataset("enriched_spliceai_snv") | ||
|
||
val spliceai_indel: DatasetConf = job.spliceai_indel | ||
val spliceai_snv: DatasetConf = job.spliceai_snv | ||
val destination: DatasetConf = job.mainDestination | ||
val job = SpliceAi(TestETLContext(), variantType = "snv") | ||
|
||
"transformSingle" should "transform NormalizedSpliceAi to EnrichedSpliceAi" in { | ||
val inputData = Map( | ||
spliceai_snv.id -> Seq(NormalizedSpliceAi("1")).toDF(), | ||
spliceai_indel.id -> Seq(NormalizedSpliceAi("2")).toDF(), | ||
) | ||
val inputData = Map(source.id -> Seq(NormalizedSpliceAi("1"), NormalizedSpliceAi("2")).toDF()) | ||
|
||
val resultDF = job.transformSingle(inputData) | ||
|
||
// ClassGenerator | ||
// .writeCLassFile( | ||
// "bio.ferlab.datalake.testutils.models.enriched", | ||
// "EnrichedSpliceAi", | ||
// resultDF, | ||
// "datalake-spark3/src/test/scala/") | ||
// ClassGenerator | ||
// .writeCLassFile( | ||
// "bio.ferlab.datalake.testutils.models.enriched", | ||
// "EnrichedSpliceAi", | ||
// resultDF, | ||
// "datalake-spark3/src/test/scala/") | ||
|
||
val expected = Seq(EnrichedSpliceAi("1"), EnrichedSpliceAi("2")) | ||
resultDF.as[EnrichedSpliceAi].collect() shouldBe expected | ||
} | ||
|
||
"transformSingle" should "compute max score for each variant-gene" in { | ||
"transformSingle" should "compute max score for each variant-gene" in { | ||
val inputData = Map( | ||
spliceai_snv.id -> Seq( | ||
source.id -> Seq( | ||
NormalizedSpliceAi(`chromosome` = "1", `start` = 1, `end` = 2, `reference` = "T", `alternate` = "C", `symbol` = "gene1", `ds_ag` = 1.0, `ds_al` = 2.00, `ds_dg` = 0.0, `ds_dl` = 0.0), | ||
NormalizedSpliceAi(`chromosome` = "1", `start` = 1, `end` = 2, `reference` = "T", `alternate` = "C", `symbol` = "gene2", `ds_ag` = 0.0, `ds_al` = 0.00, `ds_dg` = 0.0, `ds_dl` = 0.0), | ||
).toDF(), | ||
spliceai_indel.id -> Seq( | ||
NormalizedSpliceAi(`chromosome` = "1", `start` = 1, `end` = 2, `reference` = "T", `alternate` = "AT", `symbol` = "gene1", `ds_ag` = 1.0, `ds_al` = 1.00, `ds_dg` = 0.0, `ds_dl` = 0.0), | ||
).toDF(), | ||
NormalizedSpliceAi(`chromosome` = "2", `start` = 1, `end` = 2, `reference` = "T", `alternate` = "C", `symbol` = "gene1", `ds_ag` = 1.0, `ds_al` = 1.00, `ds_dg` = 0.0, `ds_dl` = 0.0), | ||
NormalizedSpliceAi(`chromosome` = "3", `start` = 1, `end` = 2, `reference` = "T", `alternate` = "C", `symbol` = "gene1", `ds_ag` = 1.0, `ds_al` = 1.00, `ds_dg` = 1.0, `ds_dl` = 1.0), | ||
).toDF() | ||
) | ||
|
||
val resultDF = job.transformSingle(inputData) | ||
resultDF.show(false) | ||
|
||
val expected = Seq( | ||
MAX_SCORE(`ds` = 2.00, `type` = Seq("AL")), | ||
MAX_SCORE(`ds` = 0.00, `type` = Seq("AG", "AL", "DG", "DL")), | ||
MAX_SCORE(`ds` = 1.00, `type` = Seq("AG", "AL")), | ||
MAX_SCORE(`ds` = 2.00, `type` = Some(Seq("AL"))), | ||
MAX_SCORE(`ds` = 0.00, `type` = None), | ||
MAX_SCORE(`ds` = 1.00, `type` = Some(Seq("AG", "AL"))), | ||
MAX_SCORE(`ds` = 1.00, `type` = Some(Seq("AG", "AL", "DG", "DL"))), | ||
) | ||
|
||
resultDF | ||
.select("max_score.*") | ||
.as[MAX_SCORE].collect() shouldBe expected | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters