From b8389b1186fe4b56ca91eadfa31886a3b4a195c6 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 3 Apr 2024 13:18:02 +0300 Subject: [PATCH] ICU-22718 Export disallowed/ignored UTS 46 data for ICU4X --- .../tools/icuexportdata/icuexportdata.cpp | 56 +++++++++++++------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/icu4c/source/tools/icuexportdata/icuexportdata.cpp b/icu4c/source/tools/icuexportdata/icuexportdata.cpp index 4aea2c1e285a..545811e82857 100644 --- a/icu4c/source/tools/icuexportdata/icuexportdata.cpp +++ b/icu4c/source/tools/icuexportdata/icuexportdata.cpp @@ -755,9 +755,12 @@ void computeDecompositions(const char* basename, std::vector nonRecursive32; LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); + UBool uts46 = false; + if (uprv_strcmp(basename, "nfkd") == 0) { mainNormalizer = Normalizer2::getNFKDInstance(status); } else if (uprv_strcmp(basename, "uts46d") == 0) { + uts46 = true; mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status); } else { mainNormalizer = nfdNormalizer; @@ -817,23 +820,38 @@ void computeDecompositions(const char* basename, nfcNormalizer->normalize(dst, nfc, status); nonNfdOrRoundTrips = (src == nfc); } + if (uts46) { + // Work around https://unicode-org.atlassian.net/browse/ICU-22658 + // TODO: Remove the workaround after data corresponding to + // https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands + // for Unicode 16. + switch (c) { + case 0x2F868: + dst.truncate(0); + dst.append(UChar32(0x36FC)); + break; + case 0x2F874: + dst.truncate(0); + dst.append(UChar32(0x5F53)); + break; + case 0x2F91F: + dst.truncate(0); + dst.append(UChar32(0x243AB)); + break; + case 0x2F95F: + dst.truncate(0); + dst.append(UChar32(0x7AEE)); + break; + case 0x2F9BF: + dst.truncate(0); + dst.append(UChar32(0x45D7)); + break; + } + } int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { - // Characters that normalize to nothing or to U+FFFD (without the - // input being U+FFFD) in ICU4C's UTS 46 normalization normalize - // as in NFD in ICU4X's UTF 46 normalization in the interest - // of data size and ICU4X's normalizer being unable to handle - // normalizing to nothing. - // When UTS 46 is implemented on top of ICU4X, a preprocessing - // step is supposed to remove these characters before the - // normalization step. - if (uprv_strcmp(basename, "uts46d") != 0) { - status.set(U_INTERNAL_PROGRAM_ERROR); - handleError(status, basename); - } - nfdNormalizer->normalize(src, dst, status); - len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); - if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + if (!uts46) { status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } @@ -951,7 +969,13 @@ void computeDecompositions(const char* basename, if (!nonNfdOrRoundTrips) { compositionPassthroughBound = c; } - if (len == 1 && utf32[0] <= 0xFFFF) { + if (!len) { + if (!uts46) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false}); + } else if (len == 1 && utf32[0] <= 0xFFFF) { if (startsWithBackwardCombiningStarter) { if (mainNormalizer == nfdNormalizer) { // Not supposed to happen in NFD