Skip to content

Commit

Permalink
Fix bug in MinLength and MaxLength analyzers where, given the NullBehavior.EmptyString option, the where filter wasn't properly applied (#538)
Browse files Browse the repository at this point in the history
  • Loading branch information
eycho-am authored Feb 26, 2024
1 parent 4070847 commit c89aad8
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 2 deletions.
3 changes: 2 additions & 1 deletion src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ case class MaxLength(column: String, where: Option[String] = None, analyzerOptio
case NullBehavior.Fail =>
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MaxValue)
case NullBehavior.EmptyString =>
length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType)
// Empty String is 0 length string
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType)
case _ =>
colLengths
}
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/com/amazon/deequ/analyzers/MinLength.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ case class MinLength(column: String, where: Option[String] = None, analyzerOptio
case NullBehavior.Fail =>
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MinValue)
case NullBehavior.EmptyString =>
length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType)
// Empty String is 0 length string
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType)
case _ =>
colLengths
}
Expand Down
41 changes: 41 additions & 0 deletions src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,47 @@ class VerificationSuiteTest extends WordSpec with Matchers with SparkContextSpec
assert(Seq(false, null, false, true, null, true).sameElements(rowLevel4))
}

"confirm that minLength and maxLength properly filters with nullBehavior empty" in withSparkSession { session =>
  val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session)

  // Both checks use the EmptyString null behavior and report rows excluded by
  // their where clause as NULL in the row-level output.
  val emptyStringWithNullOutcome =
    Option(AnalyzerOptions(NullBehavior.EmptyString, FilteredRowOutcome.NULL))

  val minLengthCheck = new Check(CheckLevel.Error, "rule1")
    .hasMinLength("item", _ > 3, analyzerOptions = emptyStringWithNullOutcome)
    .where("val1 > 3")
  val maxLengthCheck = new Check(CheckLevel.Error, "rule2")
    .hasMaxLength("item", _ <= 3, analyzerOptions = emptyStringWithNullOutcome)
    .where("val1 < 4")

  // The row-level result columns are looked up by each check's description.
  val minLengthColumn = minLengthCheck.description
  val maxLengthColumn = maxLengthCheck.description

  val result: VerificationResult = new VerificationSuite()
    .onData(data)
    .addCheck(minLengthCheck)
    .addCheck(maxLengthCheck)
    .run()

  val resultData = VerificationResult.rowLevelResultsAsDataFrame(session, result, data)

  resultData.show(false)

  // Collects the first (only) field of every row of the given result column.
  def rowLevelValues(columnName: String): Array[Any] =
    resultData.select(columnName).collect().map(_.getAs[Any](0))

  // The output must contain the input columns plus one column per check.
  val expectedColumns: Set[String] =
    data.columns.toSet ++ Set(minLengthColumn, maxLengthColumn)
  assert(resultData.columns.toSet == expectedColumns)

  // Every row that survives the filters satisfies its check, so the overall status is Success.
  assert(result.status == CheckStatus.Success)

  // Without the filter, minLength > 3 would fail for the first three rows (length 1,2,3);
  // with it, those rows are reported as null instead.
  val minLengthOutcomes = rowLevelValues(minLengthColumn)
  assert(Seq(null, null, null, true, true, true).sameElements(minLengthOutcomes))

  // Without the filter, maxLength <= 3 would fail for the last three rows (length 4,5,6);
  // with it, those rows are reported as null instead.
  val maxLengthOutcomes = rowLevelValues(maxLengthColumn)
  assert(Seq(true, true, true, null, null, null).sameElements(maxLengthOutcomes))
}

"generate a result that contains length row-level results with nullBehavior fail" in withSparkSession { session =>
val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session)

Expand Down
15 changes: 15 additions & 0 deletions src/test/scala/com/amazon/deequ/checks/CheckTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,21 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix
assertSuccess(baseCheck.hasMaxLength("att1", _ == 4.0), context)
}

"yield correct results for minimum and maximum length stats with where clause" in
  withSparkSession { sparkSession =>
    // The same options are passed to the analyzers and to the checks.
    val emptyStringOptions = Option(AnalyzerOptions(NullBehavior.EmptyString))

    // Each filter is used twice: once on the analyzer and once on the matching check.
    val minLengthFilter = "val1 > 3"
    val maxLengthFilter = "val1 <= 3"

    val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(sparkSession)
    val check = Check(CheckLevel.Error, description = "a description")

    // Compute the filtered length metrics up front so the checks are evaluated against them.
    val context = AnalysisRunner
      .onData(data)
      .addAnalyzers(Seq(
        MinLength("item", Option(minLengthFilter), emptyStringOptions),
        MaxLength("item", Option(maxLengthFilter), emptyStringOptions)))
      .run()

    // Would be 1 without the where clause.
    val filteredMinLength = check
      .hasMinLength("item", _ >= 4.0, analyzerOptions = emptyStringOptions)
      .where(minLengthFilter)
    assertSuccess(filteredMinLength, context)

    // Would be 6 without the where clause.
    val filteredMaxLength = check
      .hasMaxLength("item", _ <= 3.0, analyzerOptions = emptyStringOptions)
      .where(maxLengthFilter)
    assertSuccess(filteredMaxLength, context)
  }

"work on regular expression patterns for E-Mails" in withSparkSession { sparkSession =>
val col = "some"
val df = dataFrameWithColumn(col, StringType, sparkSession, Row("[email protected]"),
Expand Down

0 comments on commit c89aad8

Please sign in to comment.