Skip to content

Commit

Permalink
UTF8_BINARY_RTRIM
Browse files: browse the repository at this point in the history
  • Loading branch information
MaxGekk committed Oct 22, 2024
1 parent 680cb04 commit a9110d9
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project, Union}
import org.apache.spark.sql.catalyst.trees.TreePattern._
import org.apache.spark.sql.catalyst.util.{CollationAwareUTF8String, TypeUtils}
import org.apache.spark.sql.catalyst.util.{CollationAwareUTF8String, CollationFactory, TypeUtils}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
Expand Down Expand Up @@ -657,7 +657,14 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
}

@transient lazy val set: Set[Any] = child.dataType match {
case st: StringType if st.supportsLowercaseEquality => new InSet.LCaseSet(hset)
case st: StringType =>
if (st.supportsBinaryEquality) {
hset
} else if (st.supportsLowercaseEquality) {
new InSet.LCaseSet(hset)
} else {
new InSet.CollationSet(hset, st.collationId)
}
case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
case _: NullType => hset
case _ =>
Expand Down Expand Up @@ -785,6 +792,19 @@ object InSet {
strSet.contains(CollationAwareUTF8String.lowerCaseCodePoints(elem.asInstanceOf[UTF8String]))
}
}
/**
 * Immutable set wrapper over `inputSet` whose membership test uses the equality
 * function of the collation identified by `collationId` rather than `==`.
 *
 * Only `contains` is collation-aware. `incl`, `excl` and `iterator` delegate
 * straight to `inputSet`, so the sets returned by `incl`/`excl` are whatever
 * `inputSet` returns, not new `CollationSet` instances.
 *
 * @param inputSet    the wrapped elements; each non-null element is cast to
 *                    `UTF8String` when it is compared
 * @param collationId id handed to `CollationFactory.fetchCollation`
 */
class CollationSet(inputSet: Set[Any], collationId: Int) extends immutable.Set[Any] {
  private val collation = CollationFactory.fetchCollation(collationId)

  override def incl(elem: Any): Set[Any] = inputSet.incl(elem)
  override def excl(elem: Any): Set[Any] = inputSet.excl(elem)
  override def iterator: Iterator[Any] = inputSet.iterator

  /**
   * Returns true when some non-null member of `inputSet` is equal to `elem`
   * according to the collation's equality function.
   *
   * This walks `inputSet` element by element, so the cost grows with the size
   * of the set.
   *
   * @param elem the probe value; must be non-null and is cast to `UTF8String`
   */
  override def contains(elem: Any): Boolean = {
    assert(elem != null, "InSet guarantees non-null input")
    inputSet.exists { member =>
      // The wrapped set may itself hold a null entry (an IN-list containing NULL).
      // A null member can never equal the non-null probe, so skip it instead of
      // passing null into the collation's equality function.
      // NOTE(review): whether equalsFunction tolerates null is not visible here.
      member != null &&
        collation.equalsFunction(member.asInstanceOf[UTF8String], elem.asInstanceOf[UTF8String])
    }
  }
}
}

@ExpressionDescription(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,15 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
("aBc", "UTF8_LCASE", Set("b", "aa", "xyz")) -> false,
("aBc", "UTF8_LCASE", Set("b", "AbC", null)) -> true,
(null, "UTF8_LCASE", Set("b", "AbC", null)) -> null,
(" aa", "UTF8_BINARY_RTRIM", Set(" aa")) -> true,
(" aa ", "UTF8_BINARY_RTRIM", Set(" aa")) -> true,
("a ", "UTF8_BINARY_RTRIM", Set()) -> false,
("a ", "UTF8_BINARY_RTRIM", Set("a", "b", null)) -> true,
(null, "UTF8_BINARY_RTRIM", Set("1", "2")) -> null
).foreach { case ((elem, collation, inputSet), result) =>
val hset = inputSet.map(UTF8String.fromString).asInstanceOf[Set[Any]]
val iset = inputSet.map(UTF8String.fromString).asInstanceOf[Set[Any]]
checkEvaluation(
InSet(Literal.create(elem, StringType(collation)), hset),
InSet(Literal.create(elem, StringType(collation)), iset),
result
)
}
Expand Down

0 comments on commit a9110d9

Please sign in to comment.