[SPARK-48221][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (Contains, StartsWith, EndsWith, StringLocate)

### What changes were proposed in this pull request? String searching in UTF8_BINARY_LCASE now works at the character level, rather than at the byte level. For example: `contains("İ", "i");` now returns **false**, because there exists no `start, len` such that `lowercase(substring("İ", start, len)) == "i"`. ### Why are the changes needed? Fix functions that give unusable results due to one-to-many case mapping when performing string search under UTF8_BINARY_LCASE (see example above). ### Does this PR introduce _any_ user-facing change? Yes, the behaviour of the `contains`, `startswith`, `endswith`, and `locate`/`position` expressions is changed for edge cases with one-to-many case mapping. ### How was this patch tested? New unit tests in `CollationSupportSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#46511 from uros-db/alter-lcase-impl. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
GideonPotok · May 28, 2024 · 2493900 · 2493900
1 parent e9a3ed8
commit 2493900
Show file tree

Hide file tree

Showing 5 changed files with 278 additions and 251 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,155 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  /**
+   * Sentinel value returned by the lowercase search helpers in this class to indicate that the
+   * pattern string was not found when searching in a target string.
+   */
+  private static final int MATCH_NOT_FOUND = -1;
+
+  /**
+   * Checks whether the target string starts with the specified prefix when the comparison
+   * begins at the given position (0-based index referring to character position in UTF8String),
+   * with respect to the UTF8_BINARY_LCASE collation. The caller must pass a prefix that is
+   * already lowercased, so that .toLowerCase() is not invoked repeatedly on the same prefix
+   * string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
+   */
+  public static boolean lowercaseMatchFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    // A match exists exactly when the length lookup yields something other than the sentinel.
+    final int matchLength = lowercaseMatchLengthFrom(target, lowercasePattern, startPos);
+    return matchLength != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length (in characters) of the shortest substring of the target string that
+   * begins at the specified position (0-based index referring to character position in
+   * UTF8String) and whose lowercase form equals the specified prefix, with respect to the
+   * UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased. The
+   * method only considers the part of target string that starts from the specified (inclusive)
+   * position (that is, the method does not look at UTF8 characters of the target string before
+   * position `startPos`). If the prefix is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return length of the target substring that starts with the specified prefix in lowercase,
+   *         or MATCH_NOT_FOUND if no such substring exists
+   */
+  private static int lowercaseMatchLengthFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    // The upper bound does not change while iterating, so compute it once rather than calling
+    // target.numChars() again on every pass through the loop.
+    final int maxLength = target.numChars() - startPos;
+    // Candidate lengths are tried in increasing order, so the first hit is the shortest match.
+    for (int len = 0; len <= maxLength; ++len) {
+      if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target, or MATCH_NOT_FOUND
+   */
+  private static int lowercaseFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    // The character count is loop-invariant; evaluate it once instead of on every iteration.
+    final int targetLength = target.numChars();
+    // The bound is inclusive so that the position just past the last character is also tried.
+    for (int i = startPos; i <= targetLength; ++i) {
+      if (lowercaseMatchFrom(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Checks whether the target string ends with the specified suffix when the comparison stops
+   * at the given position (0-based index referring to character position in UTF8String), with
+   * respect to the UTF8_BINARY_LCASE collation. The caller must pass a suffix that is already
+   * lowercased, so that .toLowerCase() is not invoked repeatedly on the same suffix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return whether the target string ends with the specified suffix in lowercase
+   */
+  public static boolean lowercaseMatchUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    // A match exists exactly when the length lookup yields something other than the sentinel.
+    final int matchLength = lowercaseMatchLengthUntil(target, lowercasePattern, endPos);
+    return matchLength != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length (in characters) of the shortest substring of the target string that
+   * finishes at the specified position (0-based index referring to character position in
+   * UTF8String) and whose lowercase form equals the specified suffix, with respect to the
+   * UTF8_BINARY_LCASE collation. The suffix is expected to be lowercased already. Only the part
+   * of the target string that ends at the specified (non-inclusive) position is examined (that
+   * is, the method does not look at UTF8 characters of the target string at or after position
+   * `endPos`). If the suffix is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  private static int lowercaseMatchLengthUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    // Grow the candidate backwards from endPos one character at a time.
+    int len = 0;
+    while (len <= endPos) {
+      final UTF8String candidate = target.substring(endPos - len, endPos);
+      if (candidate.toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+      ++len;
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Searches backwards for the last occurrence of the pattern string in the target string that
+   * ends at or before the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The pattern string is expected
+   * to be lowercased already. The value returned is the end position of that occurrence, i.e.
+   * the largest position `i <= endPos` for which the target matches the pattern when read up to
+   * (but not including) `i`. If the pattern is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return the end position of the last occurrence of pattern in target, or MATCH_NOT_FOUND
+   */
+  private static int lowercaseRFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    // Walk the candidate end position down towards the start of the target.
+    int end = endPos;
+    while (end >= 0) {
+      if (lowercaseMatchUntil(target, lowercasePattern, end)) {
+        return end;
+      }
+      --end;
+    }
+    return MATCH_NOT_FOUND;
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -183,6 +332,23 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
     return 0;
   }
 
+  /**
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The pattern is lowercased
+   * here, so callers pass it as-is. An empty pattern always yields 0, regardless of `start`.
+   * If the pattern is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param pattern the string to be searched for
+   * @param start the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target
+   */
+  public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
+      final int start) {
+    // A string has no characters exactly when it has no bytes; checking the byte count uses the
+    // same guard as the collation-aware indexOf in this class.
+    if (pattern.numBytes() == 0) {
+      return 0;
+    }
+    return lowercaseFind(target, pattern.toLowerCase(), start);
+  }
+
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
     if (pattern.numBytes() == 0) {
@@ -467,4 +633,7 @@ public static UTF8String lowercaseTrimRight(
     }
     return srcString.copyUTF8String(0, trimByteIdx);
   }
+
+  // TODO: Add more collation-aware UTF8String operations here.
+
 }
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -118,7 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.containsInLowerCase(r);
+      return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -156,7 +156,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.startsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.startsWithInLowerCase(r);
+      return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -193,7 +193,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.endsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.endsWithInLowerCase(r);
+      return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -430,7 +430,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring,
         final int start) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), start);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start);
     }
     public static int execICU(final UTF8String string, final UTF8String substring, final int start,
         final int collationId) {

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) {
     return false;
   }
 
-  /**
-   * Returns whether `this` contains `substring` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean containsInLowerCase(final UTF8String substring) {
-    if (substring.numBytes == 0) {
-      return true;
-    }
-
-    // Both `this` and the `substring` are checked for non-ASCII characters, otherwise we would
-    // have to use `startsWithLowerCase(...)` in a loop, and it would frequently allocate
-    // (e.g. in case of `containsInLowerCase("1大1大1大...", "11")`)
-    if (!substring.isFullAscii()) {
-      return toLowerCase().contains(substring.toLowerCaseSlow());
-    }
-    if (!isFullAscii()) {
-      return toLowerCaseSlow().contains(substring.toLowerCaseAscii());
-    }
-
-    if (numBytes < substring.numBytes) {
-      return false;
-    }
-
-    final var firstLower = Character.toLowerCase(substring.getByte(0));
-    for (var i = 0; i <= (numBytes - substring.numBytes); i++) {
-      if (Character.toLowerCase(getByte(i)) == firstLower) {
-        final var rest = UTF8String.fromAddress(base, offset + i, numBytes - i);
-        if (rest.matchAtInLowerCaseAscii(substring, 0)) {
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
   /**
    * Returns the byte at position `i`.
    */
@@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
     return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
   }
 
-  private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) {
-    if (s.numBytes + pos > numBytes || pos < 0) {
-      return false;
-    }
-
-    for (var i = 0; i < s.numBytes; i++) {
-      if (Character.toLowerCase(getByte(pos + i)) != Character.toLowerCase(s.getByte(i))) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
   public boolean startsWith(final UTF8String prefix) {
     return matchAt(prefix, 0);
   }
 
-  /**
-   * Checks whether `prefix` is a prefix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean startsWithInLowerCase(final UTF8String prefix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (prefix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!prefix.isFullAscii()) {
-      return toLowerCase().startsWith(prefix.toLowerCaseSlow());
-    }
-
-    final var part = prefix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset, prefix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii());
-    }
-
-    if (numBytes < prefix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(prefix, 0);
-  }
-
   public boolean endsWith(final UTF8String suffix) {
     return matchAt(suffix, numBytes - suffix.numBytes);
   }
 
-  /**
-   * Checks whether `suffix` is a suffix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean endsWithInLowerCase(final UTF8String suffix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (suffix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!suffix.isFullAscii()) {
-      return toLowerCase().endsWith(suffix.toLowerCaseSlow());
-    }
-
-    final var part = suffix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset + numBytes - suffix.numBytes, suffix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii());
-    }
-
-    if (numBytes < suffix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes);
-  }
-
   /**
    * Returns the upper case of this string
    */