diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
index d358c92dd62c7..d13c3c6026a23 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
@@ -21,6 +21,7 @@ import scala.collection.mutable
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
@@ -52,18 +53,25 @@ class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Seria
 
   private val mapKeyDedupPolicy = SQLConf.get.getConf(SQLConf.MAP_KEY_DEDUP_POLICY)
 
+  private lazy val keyNormalizer: Any => Any = keyType match {
+    case FloatType => NormalizeFloatingNumbers.FLOAT_NORMALIZER
+    case DoubleType => NormalizeFloatingNumbers.DOUBLE_NORMALIZER
+    case _ => identity
+  }
+
   def put(key: Any, value: Any): Unit = {
     if (key == null) {
       throw QueryExecutionErrors.nullAsMapKeyNotAllowedError()
     }
-    val index = keyToIndex.getOrDefault(key, -1)
+    val keyNormalized = keyNormalizer(key)
+    val index = keyToIndex.getOrDefault(keyNormalized, -1)
     if (index == -1) {
       if (size >= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
         throw QueryExecutionErrors.exceedMapSizeLimitError(size)
       }
-      keyToIndex.put(key, values.length)
-      keys.append(key)
+      keyToIndex.put(keyNormalized, values.length)
+      keys.append(keyNormalized)
       values.append(value)
     } else {
       if (mapKeyDedupPolicy == SQLConf.MapKeyDedupPolicy.EXCEPTION.toString) {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala
index 5811f4cd4c850..3c8c49ee7fec6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeRow}
 import org.apache.spark.sql.catalyst.plans.SQLHelper
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, BinaryType, IntegerType, StructType}
+import org.apache.spark.sql.types.{ArrayType, BinaryType, DoubleType, IntegerType, StructType}
 import org.apache.spark.unsafe.Platform
 
 class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper {
@@ -60,6 +60,26 @@ class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper {
     )
   }
 
+  test("apply key normalization when creating") {
+    val builderDouble = new ArrayBasedMapBuilder(DoubleType, IntegerType)
+    builderDouble.put(-0.0, 1)
+    checkError(
+      exception = intercept[SparkRuntimeException](builderDouble.put(0.0, 2)),
+      errorClass = "DUPLICATED_MAP_KEY",
+      parameters = Map(
+        "key" -> "0.0",
+        "mapKeyDedupPolicy" -> "\"spark.sql.mapKeyDedupPolicy\"")
+    )
+  }
+
+  test("successful map normalization on build") {
+    val builder = new ArrayBasedMapBuilder(DoubleType, IntegerType)
+    builder.put(-0.0, 1)
+    val map = builder.build()
+    assert(map.numElements() == 1)
+    assert(ArrayBasedMapData.toScalaMap(map) == Map(0.0 -> 1))
+  }
+
   test("remove duplicated keys with last wins policy") {
     withSQLConf(SQLConf.MAP_KEY_DEDUP_POLICY.key -> SQLConf.MapKeyDedupPolicy.LAST_WIN.toString) {
       val builder = new ArrayBasedMapBuilder(IntegerType, IntegerType)
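
For reference, a minimal standalone sketch (not part of the patch) of the behavior this change targets. It assumes the default spark.sql.mapKeyDedupPolicy of EXCEPTION and uses only APIs visible in the diff above; the object name is illustrative.

// Illustrative sketch only: -0.0 and 0.0 compare equal but have different bit
// patterns, so a hash-based dedup map could keep both as distinct keys before
// this change. DOUBLE_NORMALIZER maps -0.0 to 0.0, which is what
// ArrayBasedMapBuilder.put now applies to every incoming key.
import org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers
import org.apache.spark.sql.catalyst.util.ArrayBasedMapBuilder
import org.apache.spark.sql.types.{DoubleType, IntegerType}

object MapKeyNormalizationSketch { // hypothetical object name
  def main(args: Array[String]): Unit = {
    val normalized = NormalizeFloatingNumbers.DOUBLE_NORMALIZER(-0.0).asInstanceOf[Double]
    // Same bit pattern as positive zero after normalization.
    assert(java.lang.Double.doubleToRawLongBits(normalized) ==
      java.lang.Double.doubleToRawLongBits(0.0))

    val builder = new ArrayBasedMapBuilder(DoubleType, IntegerType)
    builder.put(-0.0, 1)
    // With the patch, putting 0.0 would hit the same normalized key; under the
    // default EXCEPTION dedup policy that raises DUPLICATED_MAP_KEY, so the call
    // is left commented out here.
    // builder.put(0.0, 2)
    assert(builder.build().numElements() == 1)
  }
}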