From 542dcd61d170a0be0bc861e047da1c3060d43b36 Mon Sep 17 00:00:00 2001 From: Josh <5685731+marcantony@users.noreply.github.com> Date: Sat, 31 Aug 2024 11:11:21 -0400 Subject: [PATCH 1/2] Generate row-level results with withColumns Iteratively using withColumn (singular) causes performance issues when iterating over a large sequence of columns. --- src/main/scala/com/amazon/deequ/VerificationResult.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/VerificationResult.scala b/src/main/scala/com/amazon/deequ/VerificationResult.scala index 6390db821..b8fc211b3 100644 --- a/src/main/scala/com/amazon/deequ/VerificationResult.scala +++ b/src/main/scala/com/amazon/deequ/VerificationResult.scala @@ -97,10 +97,7 @@ object VerificationResult { val columnNamesToMetrics: Map[String, Column] = verificationResultToColumn(verificationResult) - val dataWithID = data.withColumn(UNIQUENESS_ID, monotonically_increasing_id()) - columnNamesToMetrics.foldLeft(dataWithID)( - (dataWithID, newColumn: (String, Column)) => - dataWithID.withColumn(newColumn._1, newColumn._2)).drop(UNIQUENESS_ID) + data.withColumns(columnNamesToMetrics) } def checkResultsAsJson(verificationResult: VerificationResult, From 54d4d7875b577aa164526e74384ab36ef8c62c44 Mon Sep 17 00:00:00 2001 From: Josh <5685731+marcantony@users.noreply.github.com> Date: Sat, 31 Aug 2024 11:34:43 -0400 Subject: [PATCH 2/2] Add back UNIQUENESS_ID --- src/main/scala/com/amazon/deequ/VerificationResult.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/amazon/deequ/VerificationResult.scala b/src/main/scala/com/amazon/deequ/VerificationResult.scala index b8fc211b3..418a622e6 100644 --- a/src/main/scala/com/amazon/deequ/VerificationResult.scala +++ b/src/main/scala/com/amazon/deequ/VerificationResult.scala @@ -97,7 +97,8 @@ object VerificationResult { val columnNamesToMetrics: Map[String, Column] = verificationResultToColumn(verificationResult) - data.withColumns(columnNamesToMetrics) + val dataWithID = data.withColumn(UNIQUENESS_ID, monotonically_increasing_id()) + dataWithID.withColumns(columnNamesToMetrics).drop(UNIQUENESS_ID) } def checkResultsAsJson(verificationResult: VerificationResult,