diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/impl/Normalizer2Impl.java b/icu4j/main/core/src/main/java/com/ibm/icu/impl/Normalizer2Impl.java index cfa66582d9f2..bede3e3d610f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/Normalizer2Impl.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/Normalizer2Impl.java @@ -25,7 +25,7 @@ * Low-level implementation of the Unicode Normalization Algorithm. * For the data structure and details see the documentation at the end of * C++ normalizer2impl.h and in the design doc at - * https://icu.unicode.org/design/normalization/custom + * https://unicode-org.github.io/icu/design/normalization/custom.html */ public final class Normalizer2Impl { public static final class Hangul { @@ -204,7 +204,7 @@ public void append(CharSequence s, int start, int limit, boolean isNFD, start+=Character.charCount(c); if(start>DELTA_SHIFT)-MAX_DELTA-1; + assert((minMaybeNo&7)==0); // 8-aligned for noNoDelta bit fields + centerNoNoDelta=(minMaybeNo>>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; @@ -492,8 +494,7 @@ public Normalizer2Impl load(ByteBuffer bytes) { nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; if(numChars!=0) { - maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); - extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); + extraData=ICUBinary.getString(bytes, numChars, 0); } // smallFCD: new in formatVersion 2 @@ -606,9 +607,11 @@ public synchronized Normalizer2Impl ensureCanonIterData() { null, range)) { final int end = range.getEnd(); final int norm16 = range.getValue(); - if(isInert(norm16) || (minYesNo<=norm16 && norm16 minYesNo) { // c decomposes, get everything from the variable-length extra data - int mapping=norm16_2>>OFFSET_SHIFT; + int mapping=getDataForYesOrNo(norm16_2); int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { @@ -696,14 +699,14 @@ public int getNorm16(int c) { public int getCompQuickCheck(int norm16) { if(norm16> OFFSET_SHIFT) & 0xff; } - public static int getCCFromYesOrMaybe(int norm16) { + public static int getCCFromYesOrMaybeYes(int norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; } - public int getCCFromYesOrMaybeCP(int c) { + public int getCCFromYesOrMaybeYesCP(int c) { if (c < minCompNoMaybeCP) { return 0; } - return getCCFromYesOrMaybe(getNorm16(c)); + return getCCFromYesOrMaybeYes(getNorm16(c)); } /** @@ -757,7 +760,7 @@ public int getFCD16FromNormData(int c) { return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; - } else { // isDecompNoAlgorithmic(norm16) + } else if(norm16> OFFSET_SHIFT; @@ -772,7 +775,7 @@ public int getFCD16FromNormData(int c) { return 0; } // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getData(norm16); int firstUnit=extraData.charAt(mapping); int fcd16=firstUnit>>8; // tccc if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { @@ -781,6 +784,24 @@ public int getFCD16FromNormData(int c) { return fcd16; } + private int getFCD16FromMaybeOrNonZeroCC(int norm16) { + assert norm16 >= minMaybeNo; + if (norm16 >= MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16 = getCCFromNormalYesOrMaybe(norm16); + return norm16 | (norm16<<8); + } else if (norm16 >= minMaybeYes) { + return 0; + } + // c decomposes, get everything from the variable-length extra data + int mapping = getDataForMaybe(norm16); + int firstUnit = extraData.charAt(mapping); + // maybeNo has lccc = 0 + assert (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || + (extraData.charAt(mapping - 1) & 0xff00) == 0; + return firstUnit >> 8; // tccc + } + /** * Gets the decomposition for one code point. * @param c code point @@ -788,7 +809,7 @@ public int getFCD16FromNormData(int c) { */ public String getDecomposition(int c) { int norm16; - if(c>OFFSET_SHIFT; + int mapping=getData(norm16); int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; return extraData.substring(mapping, mapping+length); } @@ -836,7 +857,7 @@ public String getRawDecomposition(int c) { return UTF16.valueOf(mapAlgorithmic(c, norm16)); } // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getData(norm16); int firstUnit=extraData.charAt(mapping); int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { @@ -953,7 +974,13 @@ public boolean getCanonStartSet(int c, UnicodeSet set) { public static final int IX_MIN_NO_NO_EMPTY=17; public static final int IX_MIN_LCCC_CP=18; - public static final int IX_COUNT=20; + + /** Two-way mappings; each starts with a character that combines backward. */ + public static final int IX_MIN_MAYBE_NO=20; + /** Two-way mappings & compositions. */ + public static final int IX_MIN_MAYBE_NO_COMBINES_FWD=21; + + //blic static final int IX_COUNT=22; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; public static final int MAPPING_HAS_RAW_MAPPING=0x40; @@ -1048,7 +1075,7 @@ public int decompose(CharSequence s, int src, int limit, decompose(c, norm16, buffer); } else { if(isDecompYes(norm16)) { - int cc=getCCFromYesOrMaybe(norm16); + int cc=getCCFromYesOrMaybeYes(norm16); if(prevCC<=cc || cc==0) { prevCC=cc; if(cc<=1) { @@ -1135,12 +1162,12 @@ public boolean compose(CharSequence s, int src, int limit, } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) + // or a "maybeYes" / "maybeNo" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. // Medium-fast path: Handle cases that do not require full decomposition and recomposition. - if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo if (!doCompose) { return false; } @@ -1165,7 +1192,7 @@ public boolean compose(CharSequence s, int src, int limit, if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } - int mapping = norm16 >> OFFSET_SHIFT; + int mapping = getDataForYesOrNo(norm16); int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; buffer.append(extraData, mapping, mapping + length); prevBoundary = src; @@ -1371,7 +1398,7 @@ public int composeQuickCheck(CharSequence s, int src, int limit, } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) + // or a "maybeYes" / "maybeNo" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. @@ -1388,8 +1415,9 @@ public int composeQuickCheck(CharSequence s, int src, int limit, } } - if(isMaybeOrNonZeroCC(norm16)) { - int cc=getCCFromYesOrMaybe(norm16); + if (norm16 >= minMaybeNo) { + int fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16); + int cc = (fcd16 >> 8) & 0xff; if (onlyContiguous /* FCC */ && cc != 0 && getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { // The [prevBoundary..prevSrc[ character @@ -1409,11 +1437,12 @@ public int composeQuickCheck(CharSequence s, int src, int limit, if (src == limit) { return (src<<1) | qcResult; // "yes" or "maybe" } - int prevCC = cc; + int prevCC = fcd16 & 0xff; c = Character.codePointAt(s, src); norm16 = getNorm16(c); - if (isMaybeOrNonZeroCC(norm16)) { - cc = getCCFromYesOrMaybe(norm16); + if (norm16 >= minMaybeNo) { + fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16); + cc = (fcd16 >> 8) & 0xff; if (!(prevCC <= cc || cc == 0)) { break; } @@ -1621,7 +1650,7 @@ public boolean norm16HasDecompBoundaryBefore(int norm16) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getDataForYesOrNo(norm16); int firstUnit=extraData.charAt(mapping); // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; @@ -1640,14 +1669,15 @@ public boolean norm16HasDecompBoundaryAfter(int norm16) { return true; } if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { + if (isMaybeYesOrNonZeroCC(norm16)) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } else if (norm16 < minMaybeNo) { + // Maps to an isCompYesAndZeroCC. + return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } - // Maps to an isCompYesAndZeroCC. - return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getData(norm16); int firstUnit=extraData.charAt(mapping); // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 @@ -1673,15 +1703,17 @@ public boolean isCompInert(int c, boolean onlyContiguous) { int norm16=getNorm16(c); return isCompYesAndZeroCC(norm16) && (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && - (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff); + (!onlyContiguous || isInert(norm16) || + extraData.charAt(getDataForYesOrNo(norm16)) <= 0x1ff); + // The last check fetches the mapping's first unit and checks tccc<=1. } public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } public boolean isFCDInert(int c) { return getFCD16(c)<=1; } - private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } - private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } + private boolean isMaybe(int norm16) { return minMaybeNo<=norm16 && norm16<=JAMO_VT; } + private boolean isMaybeYesOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private static boolean isInert(int norm16) { return norm16==INERT; } private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } @@ -1695,7 +1727,7 @@ private boolean isHangulLVT(int norm16) { // return norm16>=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + /** Since formatVersion 5: same as isAlgorithmicNoNo() */ + private boolean isDecompNoAlgorithmic(int norm16) { + return limitNoNo<=norm16 && norm16=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; // } private int getCCFromNoNo(int norm16) { - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getDataForYesOrNo(norm16); if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { return extraData.charAt(mapping-1)&0xff; } else { @@ -1733,7 +1767,7 @@ int getTrailCCFromCompYesAndZeroCC(int norm16) { return 0; // yesYes and Hangul LV have ccc=tccc=0 } else { // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. - return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo + return extraData.charAt(getDataForYesOrNo(norm16))>>8; // tccc from yesNo } } @@ -1742,23 +1776,28 @@ private int mapAlgorithmic(int c, int norm16) { return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; } - // Requires minYesNo>OFFSET_SHIFT); } + private int getDataForYesOrNo(int norm16) { + return norm16>>OFFSET_SHIFT; + } + private int getDataForMaybe(int norm16) { + return (norm16-minMaybeNo+limitNoNo)>>OFFSET_SHIFT; + } + private int getData(int norm16) { + if(norm16>=minMaybeNo) { + norm16=norm16-minMaybeNo+limitNoNo; + } + return norm16>>OFFSET_SHIFT; + } /** - * @return index into maybeYesCompositions, or -1 + * @return index into extraData, or -1 */ private int getCompositionsListForDecompYes(int norm16) { if(norm16>OFFSET_SHIFT; + // if yesYes: if Jamo L: harmless empty list + return getData(norm16); } } /** @@ -1766,16 +1805,12 @@ private int getCompositionsListForDecompYes(int norm16) { */ private int getCompositionsListForComposite(int norm16) { // A composite has both mapping & compositions list. - int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; - int firstUnit=maybeYesCompositions.charAt(list); + int list=getData(norm16); + int firstUnit=extraData.charAt(list); return list+ // mapping in maybeYesCompositions 1+ // +1 to skip the first unit with the mapping length (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } - private int getCompositionsListForMaybe(int norm16) { - // minMaybeYes<=norm16>OFFSET_SHIFT; - } /** * @param c code point must have compositions * @return index into maybeYesCompositions @@ -1815,13 +1850,14 @@ private int decomposeShort( private void decompose(int c, int norm16, ReorderingBuffer buffer) { // get the decomposition and the lead and trail cc's if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - buffer.append(c, getCCFromYesOrMaybe(norm16)); + if (isMaybeYesOrNonZeroCC(norm16)) { + buffer.append(c, getCCFromYesOrMaybeYes(norm16)); return; + } else if (norm16 < minMaybeNo) { + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16 = getRawNorm16(c); } - // Maps to an isCompYesAndZeroCC. - c=mapAlgorithmic(c, norm16); - norm16 = getRawNorm16(c); } if (norm16 < minYesNo) { // c does not decompose @@ -1831,7 +1867,7 @@ private void decompose(int c, int norm16, ReorderingBuffer buffer) { Hangul.decompose(c, buffer); } else { // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; + int mapping=getData(norm16); int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; @@ -1870,20 +1906,20 @@ private void decompose(int c, int norm16, ReorderingBuffer buffer) { *

See normalizer2impl.h for a more detailed description * of the compositions list format. */ - private static int combine(String compositions, int list, int trail) { + private int combine(int list, int trail) { int key1, firstUnit; if(trail(firstUnit=compositions.charAt(list))) { + while(key1>(firstUnit=extraData.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { - return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + return (extraData.charAt(list+1)<<16)|extraData.charAt(list+2); } else { - return compositions.charAt(list+1); + return extraData.charAt(list+1); } } } else { @@ -1893,17 +1929,17 @@ private static int combine(String compositions, int list, int trail) { int key2=(trail<(firstUnit=compositions.charAt(list))) { + if(key1>(firstUnit=extraData.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { - if(key2>(secondUnit=compositions.charAt(list+1))) { + if(key2>(secondUnit=extraData.charAt(list+1))) { if((firstUnit&COMP_1_LAST_TUPLE)!=0) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { - return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|extraData.charAt(list+2); } else { break; } @@ -1921,13 +1957,13 @@ private static int combine(String compositions, int list, int trail) { private void addComposites(int list, UnicodeSet set) { int firstUnit, compositeAndFwd; do { - firstUnit=maybeYesCompositions.charAt(list); + firstUnit=extraData.charAt(list); if((firstUnit&COMP_1_TRIPLE)==0) { - compositeAndFwd=maybeYesCompositions.charAt(list+1); + compositeAndFwd=extraData.charAt(list+1); list+=2; } else { - compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| - maybeYesCompositions.charAt(list+2); + compositeAndFwd=((extraData.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| + extraData.charAt(list+2); list+=3; } int composite=compositeAndFwd>>1; @@ -1973,7 +2009,7 @@ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, c=sb.codePointAt(p); p+=Character.charCount(c); norm16=getNorm16(c); - cc=getCCFromYesOrMaybe(norm16); + cc=getCCFromYesOrMaybeYes(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and @@ -2014,7 +2050,7 @@ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, } compositionsList=-1; continue; - } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { + } else if((compositeAndFwd=combine(compositionsList, c))>=0) { // The starter and the combining mark (c) do combine. int composite=compositeAndFwd>>1; @@ -2119,22 +2155,27 @@ public int composePair(int a, int b) { } } else { // 'a' has a compositions list in extraData - list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; + list=getDataForYesOrNo(norm16); if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list list+= // mapping pointer 1+ // +1 to skip the first unit with the mapping length - (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length + (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length } } - } else if(norm16>1; + return combine(list, b)>>1; } /** @@ -2163,7 +2204,8 @@ private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean o /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? - (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); + (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : + extraData.charAt(getDataForYesOrNo(norm16)) <= 0x1ff); } private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { @@ -2272,11 +2314,12 @@ private void addToStartSet(MutableCodePointTrie mutableTrie, int origin, int dec private int minNoNoEmpty; private int limitNoNo; private int centerNoNoDelta; + private int minMaybeNo; + private int minMaybeNoCombinesFwd; private int minMaybeYes; private CodePointTrie.Fast16 normTrie; - private String maybeYesCompositions; - private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters + private String extraData; // mappings and/or compositions private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 private CodePointTrie canonIterData;