Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22585 Fix infinity loop while unicode set contains single surrogate #2736

Merged
merged 1 commit into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions icu4c/source/common/rbbiscan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,7 @@ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, Unicode
RBBINode *usetNode = new RBBINode(RBBINode::uset);
if (usetNode == nullptr) {
error(U_MEMORY_ALLOCATION_ERROR);
delete setToAdopt;
return;
}
usetNode->fInputSet = setToAdopt;
Expand Down Expand Up @@ -1210,7 +1211,6 @@ RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
UnicodeSet *uset;
ParsePosition pos;
int startPos;
int i;
Expand All @@ -1222,33 +1222,30 @@ void RBBIRuleScanner::scanSet() {
pos.setIndex(fScanIndex);
startPos = fScanIndex;
UErrorCode localStatus = U_ZERO_ERROR;
uset = new UnicodeSet();
if (uset == nullptr) {
localStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
if (U_FAILURE(localStatus)) {
return;
}
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time.
#ifdef RBBI_DEBUG
RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
#endif
error(localStatus);
delete uset;
return;
}

// Verify that the set contains at least one code point.
//
U_ASSERT(uset!=nullptr);
U_ASSERT(uset.isValid());
if (uset->isEmpty()) {
// This set is empty.
// Make it an error, because it almost certainly is not what the user wanted.
// Also, avoids having to think about corner cases in the tree manipulation code
// that occurs later on.
error(U_BRK_RULE_EMPTY_SET);
delete uset;
return;
}

Expand All @@ -1257,7 +1254,7 @@ void RBBIRuleScanner::scanSet() {
// Don't just set fScanIndex because the line/char positions maintained
// for error reporting would be thrown off.
i = pos.getIndex();
for (;;) {
for (;U_SUCCESS(*fRB->fStatus);) {
if (fNextIndex >= i) {
break;
}
Expand All @@ -1280,7 +1277,7 @@ void RBBIRuleScanner::scanSet() {
// character categories for run time engine.
// - Eliminates mulitiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n->fText, n, uset);
findSetFor(n->fText, n, uset.orphan());
}

}
Expand Down
18 changes: 18 additions & 0 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
TESTCASE_AUTO(TestBug22581);
TESTCASE_AUTO(TestBug22584);
TESTCASE_AUTO(TestBug22585);

#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
Expand Down Expand Up @@ -5860,6 +5861,23 @@ void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
expected2 == actual2);
}

// Test a single unpaired unpaired char (either surrogate low or high) in
// an Unicode set will not cause infinity loop.
void RBBITest::TestBug22585() {
UnicodeString rule = u"$a=[";
rule.append(0xdecb) // an unpaired surrogate high
.append("];");
UParseError pe {};
UErrorCode ec {U_ZERO_ERROR};
RuleBasedBreakIterator bi(rule, pe, ec);

rule = u"$a=[";
rule.append(0xd94e) // an unpaired surrogate low
.append("];");
ec = U_ZERO_ERROR;
RuleBasedBreakIterator bi2(rule, pe, ec);
}

void RBBITest::TestBug22584() {
// Creating a break iterator from a rule consisting of a very long
// literal input string caused a stack overflow when deleting the
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class RBBITest: public IntlTest {
void TestExternalBreakEngineWithFakeYue();
void TestBug22581();
void TestBug22584();
void TestBug22585();

#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ public void TestBug13590() {
assertEquals("Wrong number of breaks found", 2, breaksFound);
}

/* Test handling of unpair surrogate.
/* Test handling of unpaired surrogate.
*/
@Test
public void TestUnpairedSurrogate() {
Expand All @@ -919,24 +919,24 @@ public void TestUnpairedSurrogate() {

try {
new RuleBasedBreakIterator("a\ud800b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}

try {
new RuleBasedBreakIterator("a\ude00b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}


Expand All @@ -946,6 +946,30 @@ public void TestUnpairedSurrogate() {
assertEquals("Rules does not match", rules, bi.toString());
}

@Test
public void TestBug22585() {
try {
new RuleBasedBreakIterator("$a=[\udecb];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}

try {
new RuleBasedBreakIterator("$a=[\ud94e];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}
}
/* Test preceding(index) and following(index), with semi-random indexes.
* The random indexes are produced in clusters that are relatively closely spaced,
* to increase the occurrences of hits to the internal break cache.
Expand Down
Loading