From b7f4b908d58389b6c0cc2512ed34759ee193c809 Mon Sep 17 00:00:00 2001 From: meek0 Date: Wed, 16 Oct 2024 09:12:56 -0400 Subject: [PATCH 1/3] fix: CLIN-2119 manage column mc with multiple entries --- .../datalake/spark3/publictables/normalized/Clinvar.scala | 7 ++++++- .../testutils/models/normalized/NormalizedClinvar.scala | 2 +- .../ferlab/datalake/testutils/models/raw/RawClinvar.scala | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala index 9f49d62a..00e31c6d 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala @@ -30,6 +30,7 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { val df = data(clinvar_vcf.id) spark.udf.register("inheritance", inheritance_udf) + spark.udf.register("mc", fusion_udf) val intermediateDf = df @@ -69,7 +70,7 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { ) .withColumn("clndisdbincl", split(concat_ws("", col("clndisdbincl")), "\\|")) .withColumn("clndnincl", split(concat_ws("", col("clndnincl")), "\\|")) - .withColumn("mc", split(concat_ws("|", col("mc")), "\\|")) + .withColumn("mc", fusion_udf(col("mc"))) .withColumn("inheritance", inheritance_udf(col("origin"))) .drop("clin_sig_original", "clndn") @@ -114,6 +115,10 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { } } + val fusion_udf: UserDefinedFunction = udf { array: mutable.WrappedArray[String] => + array.mkString("|").split("\\|") + } + implicit class DataFrameOps(df: DataFrame) { def withInterpretations: DataFrame = { diff --git a/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/normalized/NormalizedClinvar.scala b/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/normalized/NormalizedClinvar.scala index 896e5080..7514f97c 100644 --- a/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/normalized/NormalizedClinvar.scala +++ b/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/normalized/NormalizedClinvar.scala @@ -25,7 +25,7 @@ case class NormalizedClinvar(chromosome: String = "2", af_tgp: Double = 0.01118, clnvc: String = "single_nucleotide_variant", clnhgvs: List[String] = List("NC_000002.12:g.69359261T>A"), - mc: List[String] = List("SO:0001627", "intron_variant"), + mc: List[String] = List("SO:0001627", "intron_variant", "SO:0001589", "frameshift_variant"), af_esp: Double = 0.01415, clndisdbincl: List[String] = List(""), conditions: List[String] = List("Congenital myasthenic syndrome 12", "not specified", "not provided"), diff --git a/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/raw/RawClinvar.scala b/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/raw/RawClinvar.scala index 1a2b805c..5ac97f25 100644 --- a/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/raw/RawClinvar.scala +++ b/datalake-test-utils/src/main/scala/bio/ferlab/datalake/testutils/models/raw/RawClinvar.scala @@ -31,7 +31,7 @@ case class RawClinvar(contigName: String = "2", INFO_AF_TGP: Double = 0.01118, INFO_CLNVC: String = "single_nucleotide_variant", INFO_CLNHGVS: List[String] = List("NC_000002.12:g.69359261T>A"), - INFO_MC: List[String] = List("SO:0001627|intron_variant"), + INFO_MC: List[String] = List("SO:0001627|intron_variant", "SO:0001589|frameshift_variant"), INFO_CLNSIGCONF: Option[List[String]] = None, INFO_AF_ESP: Double = 0.01415, INFO_CLNDISDBINCL: Option[List[String]] = None, From 2ad96f4804f0e56ced0890168308bcd0ee8ebe54 Mon Sep 17 00:00:00 2001 From: meek0 Date: Wed, 16 Oct 2024 09:36:50 -0400 Subject: [PATCH 2/3] fix: CLIN-2119 github test config to use sbt/setup-sbt@v1 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e29ce82f..280a269e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,6 +15,7 @@ jobs: distribution: 'adopt' java-version: '11' cache: 'sbt' + - uses: sbt/setup-sbt@v1 - name: Run datalake-commons tests run: sbt 'project datalake-commons' 'test' - name: Run datalake-spark3 tests From f37cada0954458612468a77f76143f10b81a0893 Mon Sep 17 00:00:00 2001 From: meek0 Date: Mon, 21 Oct 2024 13:39:09 -0400 Subject: [PATCH 3/3] fix: CLIN-2119 use array_join with split instead of udf --- .../datalake/spark3/publictables/normalized/Clinvar.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala index 00e31c6d..4cfc2e54 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/Clinvar.scala @@ -30,7 +30,6 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { val df = data(clinvar_vcf.id) spark.udf.register("inheritance", inheritance_udf) - spark.udf.register("mc", fusion_udf) val intermediateDf = df @@ -70,7 +69,7 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { ) .withColumn("clndisdbincl", split(concat_ws("", col("clndisdbincl")), "\\|")) .withColumn("clndnincl", split(concat_ws("", col("clndnincl")), "\\|")) - .withColumn("mc", fusion_udf(col("mc"))) + .withColumn("mc", split(array_join(col("mc"), "|"), "\\|")) .withColumn("inheritance", inheritance_udf(col("origin"))) .drop("clin_sig_original", "clndn") @@ -115,10 +114,6 @@ case class Clinvar(rc: RuntimeETLContext) extends SimpleETLP(rc) { } } - val fusion_udf: UserDefinedFunction = udf { array: mutable.WrappedArray[String] => - array.mkString("|").split("\\|") - } - implicit class DataFrameOps(df: DataFrame) { def withInterpretations: DataFrame = {