diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e652..0d0094d8d0a03 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -34,6 +34,155 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + /** + * The constant value to indicate that the match is not found when searching for a pattern + * string in a target string. + */ + private static final int MATCH_NOT_FOUND = -1; + + /** + * Returns whether the target string starts with the specified prefix, starting from the + * specified position (0-based index referring to character position in UTF8String), with respect + * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased + * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the + * same prefix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE + */ + public static boolean lowercaseMatchFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that starts with the specified + * prefix, starting from the specified position (0-based index referring to character position + * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * prefix is already lowercased. 
The method only considers the part of target string that + * starts from the specified (inclusive) position (that is, the method does not look at UTF8 + * characters of the target string before position `startPos`). If the prefix is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return length of the target substring that starts with the specified prefix in lowercase + */ + private static int lowercaseMatchLengthFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int len = 0; len <= target.numChars() - startPos; ++len) { + if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. 
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target + */ + private static int lowercaseFind( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int i = startPos; i <= target.numChars(); ++i) { + if (lowercaseMatchFrom(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns whether the target string ends with the specified suffix, ending at the specified + * position (0-based index referring to character position in UTF8String), with respect to the + * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior + * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same + * suffix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return whether the target string ends with the specified suffix in lowercase + */ + public static boolean lowercaseMatchUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that ends with the specified + * suffix, ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * suffix is already lowercased. The method only considers the part of target string that ends + * at the specified (non-inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). 
If the suffix is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + private static int lowercaseMatchLengthUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int len = 0; len <= endPos; ++len) { + if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the last occurrence of the pattern string in the target string, + * ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. 
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return the (exclusive) end position of the last occurrence of pattern in target + */ + private static int lowercaseRFind( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int i = endPos; i >= 0; --i) { + if (lowercaseMatchUntil(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String @@ -183,6 +332,23 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + /** + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found, + * MATCH_NOT_FOUND is returned. 
+ * + * @param target the string to be searched in + * @param pattern the string to be searched for + * @param start the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target + */ + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + return lowercaseFind(target, pattern.toLowerCase(), start); + } + public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { @@ -467,4 +633,7 @@ public static UTF8String lowercaseTrimRight( } return srcString.copyUTF8String(0, trimByteIdx); } + + // TODO: Add more collation-aware UTF8String operations here. + } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b4489..8f7aed30464cc 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -118,7 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.containsInLowerCase(r); + return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -156,7 +156,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.startsWithInLowerCase(r); + return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0); } public static boolean execICU(final UTF8String l, final UTF8String r, final 
int collationId) { @@ -193,7 +193,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.endsWithInLowerCase(r); + return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -430,7 +430,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring } public static int execLowercase(final UTF8String string, final UTF8String substring, final int start) { - return string.toLowerCase().indexOf(substring.toLowerCase(), start); + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start); } public static int execICU(final UTF8String string, final UTF8String substring, final int start, final int collationId) { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 20b26b6ebc5a5..03286e0635287 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) { return false; } - /** - * Returns whether `this` contains `substring` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean containsInLowerCase(final UTF8String substring) { - if (substring.numBytes == 0) { - return true; - } - - // Both `this` and the `substring` are checked for non-ASCII characters, otherwise we would - // have to use `startsWithLowerCase(...)` in a loop, and it would frequently allocate - // (e.g. 
in case of `containsInLowerCase("1大1大1大...", "11")`) - if (!substring.isFullAscii()) { - return toLowerCase().contains(substring.toLowerCaseSlow()); - } - if (!isFullAscii()) { - return toLowerCaseSlow().contains(substring.toLowerCaseAscii()); - } - - if (numBytes < substring.numBytes) { - return false; - } - - final var firstLower = Character.toLowerCase(substring.getByte(0)); - for (var i = 0; i <= (numBytes - substring.numBytes); i++) { - if (Character.toLowerCase(getByte(i)) == firstLower) { - final var rest = UTF8String.fromAddress(base, offset + i, numBytes - i); - if (rest.matchAtInLowerCaseAscii(substring, 0)) { - return true; - } - } - } - - return false; - } - /** * Returns the byte at position `i`. */ @@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) { return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes); } - private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) { - if (s.numBytes + pos > numBytes || pos < 0) { - return false; - } - - for (var i = 0; i < s.numBytes; i++) { - if (Character.toLowerCase(getByte(pos + i)) != Character.toLowerCase(s.getByte(i))) { - return false; - } - } - - return true; - } - public boolean startsWith(final UTF8String prefix) { return matchAt(prefix, 0); } - /** - * Checks whether `prefix` is a prefix of `this` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean startsWithInLowerCase(final UTF8String prefix) { - // No way to match sizes of strings for early return, since single grapheme can be expanded - // into several independent ones in lowercase - if (prefix.numBytes == 0) { - return true; - } - if (numBytes == 0) { - return false; - } - - if (!prefix.isFullAscii()) { - return toLowerCase().startsWith(prefix.toLowerCaseSlow()); - } - - final var part = prefix.numBytes >= numBytes ? 
this : UTF8String.fromAddress( - base, offset, prefix.numBytes); - if (!part.isFullAscii()) { - return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii()); - } - - if (numBytes < prefix.numBytes) { - return false; - } - - return matchAtInLowerCaseAscii(prefix, 0); - } - public boolean endsWith(final UTF8String suffix) { return matchAt(suffix, numBytes - suffix.numBytes); } - /** - * Checks whether `suffix` is a suffix of `this` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean endsWithInLowerCase(final UTF8String suffix) { - // No way to match sizes of strings for early return, since single grapheme can be expanded - // into several independent ones in lowercase - if (suffix.numBytes == 0) { - return true; - } - if (numBytes == 0) { - return false; - } - - if (!suffix.isFullAscii()) { - return toLowerCase().endsWith(suffix.toLowerCaseSlow()); - } - - final var part = suffix.numBytes >= numBytes ? 
this : UTF8String.fromAddress( - base, offset + numBytes - suffix.numBytes, suffix.numBytes); - if (!part.isFullAscii()) { - return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii()); - } - - if (numBytes < suffix.numBytes) { - return false; - } - - return matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes); - } - /** * Returns the upper case of this string */ diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3b..eb18d7665b092 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -101,14 +101,6 @@ public void testContains() throws SparkException { assertContains("ab世De", "AB世dE", "UNICODE_CI", true); assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - // Case-variable character length - assertContains("abİo12", "i̇o", "UNICODE_CI", true); - assertContains("abi̇o12", "İo", "UNICODE_CI", true); - assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false); // Characters with the same binary lowercase representation assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); @@ -116,6 +108,33 @@ public void testContains() throws SparkException { assertContains("2 Kelvin.", "2 Kelvin", 
"UTF8_BINARY_LCASE", true); assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + // Case-variable character length + assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "\u0307", "UNICODE_CI", false); + assertContains("i̇", "İ", "UNICODE_CI", true); + assertContains("İ", "i", "UNICODE_CI", false); + assertContains("adi̇os", "io", "UNICODE_CI", false); + assertContains("adi̇os", "Io", "UNICODE_CI", false); + assertContains("adi̇os", "i̇o", "UNICODE_CI", true); + assertContains("adi̇os", "İo", "UNICODE_CI", true); + assertContains("adİos", "io", "UNICODE_CI", false); + assertContains("adİos", "Io", "UNICODE_CI", false); + assertContains("adİos", "i̇o", "UNICODE_CI", true); + assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertContains("İ", "\u0307", "UTF8_BINARY_LCASE", false); + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); + assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true); + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); + assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", true); + assertContains("adİos", "io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", true); + assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true); } private void assertStartsWith( @@ -191,13 +210,6 @@ public void testStartsWith() throws SparkException { assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - // 
Case-variable character length - assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); - assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); - assertStartsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertStartsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertStartsWith("İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", true); - assertStartsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false); // Characters with the same binary lowercase representation assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); @@ -205,6 +217,37 @@ public void testStartsWith() throws SparkException { assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + // Case-variable character length + assertStartsWith("i̇", "i", "UNICODE_CI", false); + assertStartsWith("i̇", "İ", "UNICODE_CI", true); + assertStartsWith("İ", "i", "UNICODE_CI", false); + assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false); + assertStartsWith("İi̇İ", "i̇İ", "UNICODE_CI", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false); + assertStartsWith("i̇onic", "io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true); + assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); + assertStartsWith("İonic", "io", "UNICODE_CI", false); + assertStartsWith("İonic", "Io", "UNICODE_CI", false); + assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); + assertStartsWith("İonic", "İo", "UNICODE_CI", true); + assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); + assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false); + assertStartsWith("İİİ", 
"i̇i̇", "UTF8_BINARY_LCASE", true); + assertStartsWith("İİİ", "i̇i", "UTF8_BINARY_LCASE", false); + assertStartsWith("İi̇İ", "i̇İ", "UTF8_BINARY_LCASE", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false); + assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false); + assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true); + assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", true); + assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false); + assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false); + assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", true); + assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true); } private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected) @@ -279,13 +322,6 @@ public void testEndsWith() throws SparkException { assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - // Case-variable character length - assertEndsWith("The İo", "i̇o", "UNICODE_CI", true); - assertEndsWith("The i̇o", "İo", "UNICODE_CI", true); - assertEndsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertEndsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertEndsWith("The İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertEndsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false); // Characters with the same binary lowercase representation assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true); @@ -293,6 +329,38 @@ public void testEndsWith() throws SparkException { assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false); 
+ // Case-variable character length + assertEndsWith("i̇", "\u0307", "UNICODE_CI", false); + assertEndsWith("i̇", "İ", "UNICODE_CI", true); + assertEndsWith("İ", "i", "UNICODE_CI", false); + assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false); + assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false); + assertEndsWith("the i̇o", "io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); + assertEndsWith("the i̇o", "İo", "UNICODE_CI", true); + assertEndsWith("the İo", "io", "UNICODE_CI", false); + assertEndsWith("the İo", "Io", "UNICODE_CI", false); + assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); + assertEndsWith("the İo", "İo", "UNICODE_CI", true); + assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); + assertEndsWith("İ", "\u0307", "UTF8_BINARY_LCASE", false); + assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); + assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false); + assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_BINARY_LCASE", false); + assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); + assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", true); + assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", true); + assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true); } private void assertStringSplitSQL(String str, String delimiter, String collationName, @@ -709,12 +777,24 @@ public 
void testLocate() throws SparkException { assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); // Case-variable character length + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2); + assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // != UTF8_BINARY + assertLocate("i", "i̇", 1, "UNICODE_CI", 0); + assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0); + assertLocate("i̇", "i", 1, "UNICODE_CI", 0); + assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); + assertLocate("İ", "i", 1, "UNICODE_CI", 0); + assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // != UNICODE_CI + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // != UNICODE_CI + assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0); + assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1); + assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0); assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1); assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); - assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); // 12 instead of 11 + assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); } private void assertSubstringIndex(String string, String delimiter, Integer count, @@ -1008,3 +1088,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. 
} +// checkstyle.on: AvoidEscapedUnicodeCharacters diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index a1aba86cfbc56..0188297fd05a2 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -215,43 +215,6 @@ public void contains() { assertFalse(fromString("大千世界").contains(fromString("大千世界好"))); } - @Test - public void containsInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.containsInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").containsInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").containsInLowerCase(fromString("a"))); - assertTrue(fromString("a").containsInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.containsInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").containsInLowerCase(fromString("ello"))); - assertFalse(fromString("hello").containsInLowerCase(fromString("vello"))); - assertFalse(fromString("hello").containsInLowerCase(fromString("hellooo"))); - // Unicode - assertTrue(fromString("大千世界").containsInLowerCase(fromString("千世界"))); - assertFalse(fromString("大千世界").containsInLowerCase(fromString("世千"))); - assertFalse(fromString("大千世界").containsInLowerCase(fromString("大千世界好"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").containsInLowerCase(fromString("ElL"))); - assertFalse(fromString("HeLlO").containsInLowerCase(fromString("ElLoO"))); - // Unicode lowercase - assertTrue(fromString("ЯбЛоКо").containsInLowerCase(fromString("БлОк"))); - assertFalse(fromString("ЯбЛоКо").containsInLowerCase(fromString("лОкБ"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin"))); - 
assertTrue(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").containsInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").containsInLowerCase(fromString("the İodine"))); - assertTrue(fromString("The İodiNe").containsInLowerCase(fromString(" i̇oDin"))); - assertTrue(fromString("İodiNe").containsInLowerCase(fromString("i̇oDin"))); - assertFalse(fromString("İodiNe").containsInLowerCase(fromString(" i̇oDin"))); - } - @Test public void startsWith() { assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8)); @@ -263,40 +226,6 @@ public void startsWith() { assertFalse(fromString("大千世界").startsWith(fromString("大千世界好"))); } - @Test - public void startsWithInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.startsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").startsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").startsWithInLowerCase(fromString("a"))); - assertTrue(fromString("a").startsWithInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.startsWithInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").startsWithInLowerCase(fromString("hell"))); - assertFalse(fromString("hello").startsWithInLowerCase(fromString("ell"))); - // Unicode - assertTrue(fromString("大千世界").startsWithInLowerCase(fromString("大千"))); - assertFalse(fromString("大千世界").startsWithInLowerCase(fromString("世千"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").startsWithInLowerCase(fromString("hElL"))); - assertFalse(fromString("HeLlO").startsWithInLowerCase(fromString("ElL"))); - // Unicode lowercase - 
assertTrue(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("яБлОк"))); - assertFalse(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("БлОк"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").startsWithInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").startsWithInLowerCase(fromString("the İodine"))); - assertTrue(fromString("İodiNe").startsWithInLowerCase(fromString("i̇oDin"))); - assertFalse(fromString("The İodiNe").startsWithInLowerCase(fromString("i̇oDin"))); - } - @Test public void endsWith() { assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8)); @@ -308,40 +237,6 @@ public void endsWith() { assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头"))); } - @Test - public void endsWithInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.endsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").endsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").endsWithInLowerCase(fromString("a"))); - assertTrue(fromString("a").endsWithInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.endsWithInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").endsWithInLowerCase(fromString("ello"))); - assertFalse(fromString("hello").endsWithInLowerCase(fromString("hell"))); - // Unicode - assertTrue(fromString("大千世界").endsWithInLowerCase(fromString("世界"))); - 
assertFalse(fromString("大千世界").endsWithInLowerCase(fromString("大千"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").endsWithInLowerCase(fromString("ElLo"))); - assertFalse(fromString("HeLlO").endsWithInLowerCase(fromString("hElL"))); - // Unicode lowercase - assertTrue(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("БлОкО"))); - assertFalse(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("яБлОк"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").endsWithInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").endsWithInLowerCase(fromString("the İodine"))); - assertTrue(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDine"))); - assertFalse(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDin"))); - } - @Test public void substring() { assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));