Skip to content

Commit

Permalink
ICU-22742 Fix handling of XA,XB,XC for addLikelySubtags
Browse files Browse the repository at this point in the history
Add more tests.

ICU-22742 Add PS... variants

ICU-22742 Add java tests

ICU-22742 extend tests

ICU-22742 wrap java
  • Loading branch information
FrankYFTang authored and mihnita committed Apr 24, 2024
1 parent a91cbd6 commit 3aa8b8c
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 103 deletions.
67 changes: 30 additions & 37 deletions icu4c/source/common/loclikelysubtags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,47 +564,40 @@ LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, co
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
char c1;
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
}
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
}
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
if (!returnInputIfUnmatch) {
char c1;
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
}

if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, lsrFlags, errorCode);
if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, lsrFlags, errorCode);
}
// else normal locale
}
// else normal locale
}
} // end of if (!returnInputIfUnmatch)

language = getCanonical(languageAliases, language);
// (We have no script mappings.)
Expand Down
16 changes: 8 additions & 8 deletions icu4c/source/test/cintltst/cloctst.c
Original file line number Diff line number Diff line change
Expand Up @@ -3913,17 +3913,17 @@ const char* const basic_maximize_data[][2] = {
"zh_Hani",
"zh_Hani_CN" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22545
// ICU-22545 & ICU-22742
"en_XA",
"en_XA"
"en_Latn_XA"
}, {
// ICU-22545
"en_XB",
"en_XB"
// ICU-22545 & ICU-22742
"ar_XB",
"ar_Arab_XB"
}, {
// ICU-22545
"en_XC",
"en_XC"
// ICU-22545 & ICU-22742
"ru_XC",
"ru_Cyrl_XC"
}
};

Expand Down
159 changes: 149 additions & 10 deletions icu4c/source/test/intltest/loctest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
#endif
TESTCASE_AUTO(TestSetIsBogus);
TESTCASE_AUTO(TestParallelAPIValues);
TESTCASE_AUTO(TestPseudoLocales);
TESTCASE_AUTO(TestAddLikelySubtags);
TESTCASE_AUTO(TestMinimizeSubtags);
TESTCASE_AUTO(TestAddLikelyAndMinimizeSubtags);
Expand Down Expand Up @@ -1740,6 +1741,119 @@ LocaleTest::TestSetIsBogus() {
}


// Verifies Locale::addLikelySubtags() for pseudolocale inputs: the pseudo
// regions XA / XB / XC and the PSACCENT / PSBIDI / PSCRACK variants.
//
// Each table entry is a BCP 47 input tag and the tag expected after
// addLikelySubtags(). Every entry is checked twice: once as written, and once
// with a fixed block of -u- / -x- extensions appended to both the input and
// the expected tag, so the extensions must come through unchanged.
void LocaleTest::TestPseudoLocales() {
    // input locale tag, expected locale tag
    static const struct {
        const char* const input;
        const char* const expected;
    } test_cases[] = {
        // language + region, en
        { "en-XA", "en-Latn-XA" },
        { "en-XB", "en-Latn-XB" },
        { "en-XC", "en-Latn-XC" },

        // language + region, ar
        { "ar-XA", "ar-Arab-XA" },
        { "ar-XB", "ar-Arab-XB" },
        { "ar-XC", "ar-Arab-XC" },

        // language + region, something other than en, ar
        { "ru-XA", "ru-Cyrl-XA" },
        { "el-XB", "el-Grek-XB" },

        // undefined language - region
        { "und-XA", "en-Latn-XA" },
        { "und-XB", "en-Latn-XB" },
        { "und-XC", "en-Latn-XC" },

        // undefined language + script + region
        { "und-Latn-XA", "en-Latn-XA" },
        { "und-Latn-XB", "en-Latn-XB" },
        { "und-Latn-XC", "en-Latn-XC" },
        { "und-Arab-XA", "ar-Arab-XA" },
        { "und-Arab-XB", "ar-Arab-XB" },
        { "und-Arab-XC", "ar-Arab-XC" },
        { "und-Cyrl-XA", "ru-Cyrl-XA" },
        { "und-Grek-XB", "el-Grek-XB" },

        // Make sure the script is not damaged, when correct
        { "ru-Cyrl-XA", "ru-Cyrl-XA" },
        { "el-Grek-XB", "el-Grek-XB" },

        // Make sure the script is not damaged, even if it is wrong
        { "ru-Grek-XA", "ru-Grek-XA" },
        { "el-Cyrl-XB", "el-Cyrl-XB" },

        // PS variants combined with a pseudo region
        { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
        { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
        { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
        { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
        { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
        { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
        { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
        { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
        { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },

        // PS variants combined with a regular region
        { "en-US-PSACCENT", "en-Latn-US-psaccent" },
        { "en-US-PSBIDI", "en-Latn-US-psbidi" },
        { "en-US-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },

        // PS variants with language only
        { "en-PSACCENT", "en-Latn-US-psaccent" },
        { "en-PSBIDI", "en-Latn-US-psbidi" },
        { "en-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-PSCRACK", "ar-Arab-EG-pscrack" },

        // PS variants with undefined language + region
        { "und-US-PSACCENT", "en-Latn-US-psaccent" },
        { "und-US-PSBIDI", "en-Latn-US-psbidi" },
        { "und-US-PSCRACK", "en-Latn-US-pscrack" },
        { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },

        // PS variants with undefined language only
        { "und-PSACCENT", "en-Latn-US-psaccent" },
        { "und-PSBIDI", "en-Latn-US-psbidi" },
        { "und-PSCRACK", "en-Latn-US-pscrack" },
    };

    // Appended verbatim to both the input and the expected tag in the second
    // pass of each iteration.
    const std::string extensions("-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-somethin-more");

    IcuTestErrorCode status(*this, "TestPseudoLocales()");
    for (const auto& item : test_cases) {
        const char* const inputTag = item.input;
        const char* const expectedTag = item.expected;

        // Pass 1: the tag exactly as listed in the table.
        Locale result = Locale::forLanguageTag(inputTag, status);
        result.addLikelySubtags(status);
        status.errIfFailureAndReset("\"%s\"", inputTag);
        Locale expected = Locale::forLanguageTag(expectedTag, status);
        status.errIfFailureAndReset("\"%s\"", expectedTag);
        assertEquals(inputTag, expected.getName(), result.getName());

        // Pass 2: the same tag with the extension block appended.
        std::string extendedTag(inputTag);
        extendedTag.append(extensions);

        result = Locale::forLanguageTag(extendedTag, status);
        result.addLikelySubtags(status);
        // The tag is passed as an argument to a fixed format, never as the
        // printf-style format string itself.
        status.errIfFailureAndReset("\"%s\"", extendedTag.c_str());

        std::string expectedExtendedTag(expectedTag);
        expectedExtendedTag.append(extensions);

        expected = Locale::forLanguageTag(expectedExtendedTag, status);
        status.errIfFailureAndReset("\"%s\"", expectedExtendedTag.c_str());
        assertEquals(extendedTag.c_str(), expected.getName(), result.getName());
    }
}

void
LocaleTest::TestAddLikelySubtags() {
IcuTestErrorCode status(*this, "TestAddLikelySubtags()");
Expand Down Expand Up @@ -3971,20 +4085,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"zh_Hani_CN", // If change, please also update common/unicode/locid.h
"zh_Hani"
}, {
// ICU-22545
"en_XA",
// ICU-22545 & ICU-22742
"en_XA",
"en_Latn_XA",
"en_XA",
}, {
// ICU-22545
"en_XB",
"en_XB",
"en_XB",
// ICU-22545 & ICU-22742
"ar_XB",
"ar_Arab_XB",
"ar_XB",
}, {
// ICU-22545 & ICU-22742
"ru_XC",
"ru_Cyrl_XC",
"ru_XC",
}, {
// ICU-22742
"en_PSACCENT",
"en_Latn_US_PSACCENT",
"en__PSACCENT"
}, {
"ar_PSBIDI",
"ar_Arab_EG_PSBIDI",
"ar__PSBIDI"
}, {
"ru_PSCRACK",
"ru_Cyrl_RU_PSCRACK",
"ru__PSCRACK"
}, {
"ar_PSACCENT",
"ar_Arab_EG_PSACCENT",
"ar__PSACCENT"
}, {
"ru_PSBIDI",
"ru_Cyrl_RU_PSBIDI",
"ru__PSBIDI"
}, {
// ICU-22545
"en_XC",
"en_XC",
"en_XC",
"en_PSCRACK",
"en_Latn_US_PSCRACK",
"en__PSCRACK"
}
};

Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/loctest.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ class LocaleTest: public IntlTest {
void TestKnownCanonicalizedListCorrect();
void TestConstructorAcceptsBCP47();

void TestPseudoLocales();
void TestAddLikelySubtags();
void TestMinimizeSubtags();
void TestAddLikelyAndMinimizeSubtags();
Expand Down
Loading

0 comments on commit 3aa8b8c

Please sign in to comment.