From e8572585efeae7a39d78fba3208c84bdc0ba80ec Mon Sep 17 00:00:00 2001 From: Stephen Kestle Date: Thu, 10 Jun 2021 14:13:40 +1200 Subject: [PATCH] Validated data frames with not null columns against null schema columns. Previously, a dataframe would be considered invalid if it had not-null data, but the schema allowed the data to be nullable --- .../daria/sql/DataFrameSchemaChecker.scala | 6 ++-- .../sql/DataFrameSchemaCheckerTest.scala | 36 ++++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaChecker.scala b/src/main/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaChecker.scala index 7d30f371..8cdeb4ad 100644 --- a/src/main/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaChecker.scala +++ b/src/main/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaChecker.scala @@ -18,9 +18,8 @@ private[sql] class DataFrameSchemaChecker(df: DataFrame, requiredSchema: StructT case Success(namedField) => val basicMatch = namedField.name == reqField.name && - namedField.nullable == reqField.nullable && + (!namedField.nullable || reqField.nullable) && namedField.metadata == reqField.metadata - val contentMatch = reqField.dataType match { case reqSchema: StructType => namedField.dataType match { @@ -28,7 +27,8 @@ private[sql] class DataFrameSchemaChecker(df: DataFrame, requiredSchema: StructT diff(reqSchema, fieldSchema).isEmpty case _ => false } - case _ => reqField == namedField + case namedField.dataType => true + case _ => false } basicMatch && contentMatch diff --git a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala index b7c6823c..8fbad98e 100644 --- a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala +++ b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala @@ -221,7 +221,7 @@ object DataFrameSchemaCheckerTest extends TestSuite with SparkSessionTestWrapper val expected = "The [StructField(name,StringType,true)] StructFields are not included in the DataFrame with the following StructFields [StructType(StructField(num1,IntegerType,true), StructField(num2,IntegerType,true))]" - + println(c.missingStructFieldsMessage()) assert(c.missingStructFieldsMessage() == expected) } @@ -363,6 +363,40 @@ object DataFrameSchemaCheckerTest extends TestSuite with SparkSessionTestWrapper } + "validates non-null column against a null schema" - { + val sourceSchema = List( + StructField( + "num", + IntegerType, + false + ) + ) + + val sourceDF = + spark.createDataFrame( + spark.sparkContext.parallelize(Seq[Row]()), + StructType(sourceSchema) + ) + + val requiredSchema = + StructType( + List( + StructField( + "num", + IntegerType, + true + ) + ) + ) + + val c = new DataFrameSchemaChecker( + sourceDF, + requiredSchema + ) + + c.validateSchema() + } + } }