Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22718 backport Export disallowed/ignored UTS 46 data for ICU4X #3009

Merged
merged 1 commit into from
May 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 40 additions & 16 deletions icu4c/source/tools/icuexportdata/icuexportdata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -755,9 +755,12 @@ void computeDecompositions(const char* basename,
std::vector<uint32_t> nonRecursive32;
LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));

UBool uts46 = false;

if (uprv_strcmp(basename, "nfkd") == 0) {
mainNormalizer = Normalizer2::getNFKDInstance(status);
} else if (uprv_strcmp(basename, "uts46d") == 0) {
uts46 = true;
mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
} else {
mainNormalizer = nfdNormalizer;
Expand Down Expand Up @@ -817,23 +820,38 @@ void computeDecompositions(const char* basename,
nfcNormalizer->normalize(dst, nfc, status);
nonNfdOrRoundTrips = (src == nfc);
}
if (uts46) {
// Work around https://unicode-org.atlassian.net/browse/ICU-22658
// TODO: Remove the workaround after data corresponding to
// https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands
// for Unicode 16.
switch (c) {
case 0x2F868:
dst.truncate(0);
dst.append(UChar32(0x36FC));
break;
case 0x2F874:
dst.truncate(0);
dst.append(UChar32(0x5F53));
break;
case 0x2F91F:
dst.truncate(0);
dst.append(UChar32(0x243AB));
break;
case 0x2F95F:
dst.truncate(0);
dst.append(UChar32(0x7AEE));
break;
case 0x2F9BF:
dst.truncate(0);
dst.append(UChar32(0x45D7));
break;
}
}
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);

if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
// Characters that normalize to nothing or to U+FFFD (without the
// input being U+FFFD) in ICU4C's UTS 46 normalization normalize
// as in NFD in ICU4X's UTS 46 normalization in the interest
// of data size and ICU4X's normalizer being unable to handle
// normalizing to nothing.
// When UTS 46 is implemented on top of ICU4X, a preprocessing
// step is supposed to remove these characters before the
// normalization step.
if (uprv_strcmp(basename, "uts46d") != 0) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
nfdNormalizer->normalize(src, dst, status);
len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
if (!uts46) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
Expand Down Expand Up @@ -951,7 +969,13 @@ void computeDecompositions(const char* basename,
if (!nonNfdOrRoundTrips) {
compositionPassthroughBound = c;
}
if (len == 1 && utf32[0] <= 0xFFFF) {
if (!len) {
if (!uts46) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false});
} else if (len == 1 && utf32[0] <= 0xFFFF) {
if (startsWithBackwardCombiningStarter) {
if (mainNormalizer == nfdNormalizer) {
// Not supposed to happen in NFD
Expand Down