Skip to content

Commit

Permalink
ICU-22585 Fix infinity loop while unicode set contains single surrogate
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankYFTang committed Dec 11, 2023
1 parent 7d3cd7c commit 8b14c05
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 18 deletions.
19 changes: 8 additions & 11 deletions icu4c/source/common/rbbiscan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,7 @@ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, Unicode
RBBINode *usetNode = new RBBINode(RBBINode::uset);
if (usetNode == nullptr) {
error(U_MEMORY_ALLOCATION_ERROR);
delete setToAdopt;
return;
}
usetNode->fInputSet = setToAdopt;
Expand Down Expand Up @@ -1210,7 +1211,6 @@ RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
UnicodeSet *uset;
ParsePosition pos;
int startPos;
int i;
Expand All @@ -1222,33 +1222,30 @@ void RBBIRuleScanner::scanSet() {
pos.setIndex(fScanIndex);
startPos = fScanIndex;
UErrorCode localStatus = U_ZERO_ERROR;
uset = new UnicodeSet();
if (uset == nullptr) {
localStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
if (U_FAILURE(localStatus)) {
return;
}
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time.
#ifdef RBBI_DEBUG
RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
#endif
error(localStatus);
delete uset;
return;
}

// Verify that the set contains at least one code point.
//
U_ASSERT(uset!=nullptr);
U_ASSERT(uset.isValid());
if (uset->isEmpty()) {
// This set is empty.
// Make it an error, because it almost certainly is not what the user wanted.
// Also, avoids having to think about corner cases in the tree manipulation code
// that occurs later on.
error(U_BRK_RULE_EMPTY_SET);
delete uset;
return;
}

Expand All @@ -1257,7 +1254,7 @@ void RBBIRuleScanner::scanSet() {
// Don't just set fScanIndex because the line/char positions maintained
// for error reporting would be thrown off.
i = pos.getIndex();
for (;;) {
for (;U_SUCCESS(*fRB->fStatus);) {
if (fNextIndex >= i) {
break;
}
Expand All @@ -1280,7 +1277,7 @@ void RBBIRuleScanner::scanSet() {
// character categories for run time engine.
// - Eliminates mulitiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n->fText, n, uset);
findSetFor(n->fText, n, uset.orphan());
}

}
Expand Down
18 changes: 18 additions & 0 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
TESTCASE_AUTO(TestBug22581);
TESTCASE_AUTO(TestBug22584);
TESTCASE_AUTO(TestBug22585);

#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
Expand Down Expand Up @@ -5860,6 +5861,23 @@ void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
expected2 == actual2);
}

// Test a single unpaired unpaired char (either surrogate low or high) in
// an Unicode set will not cause infinity loop.
void RBBITest::TestBug22585() {
UnicodeString rule = u"$a=[";
rule.append(0xdecb) // an unpaired surrogate high
.append("];");
UParseError pe {};
UErrorCode ec {U_ZERO_ERROR};
RuleBasedBreakIterator bi(rule, pe, ec);

rule = u"$a=[";
rule.append(0xd94e) // an unpaired surrogate low
.append("];");
ec = U_ZERO_ERROR;
RuleBasedBreakIterator bi2(rule, pe, ec);
}

void RBBITest::TestBug22584() {
// Creating a break iterator from a rule consisting of a very long
// literal input string caused a stack overflow when deleting the
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class RBBITest: public IntlTest {
void TestExternalBreakEngineWithFakeYue();
void TestBug22581();
void TestBug22584();
void TestBug22585();

#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ public void TestBug13590() {
assertEquals("Wrong number of breaks found", 2, breaksFound);
}

/* Test handling of unpair surrogate.
/* Test handling of unpaired surrogate.
*/
@Test
public void TestUnpairedSurrogate() {
Expand All @@ -919,24 +919,24 @@ public void TestUnpairedSurrogate() {

try {
new RuleBasedBreakIterator("a\ud800b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}

try {
new RuleBasedBreakIterator("a\ude00b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}


Expand All @@ -946,6 +946,30 @@ public void TestUnpairedSurrogate() {
assertEquals("Rules does not match", rules, bi.toString());
}

@Test
public void TestBug22585() {
try {
new RuleBasedBreakIterator("$a=[\udecb];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}

try {
new RuleBasedBreakIterator("$a=[\ud94e];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}
}
/* Test preceding(index) and following(index), with semi-random indexes.
* The random indexes are produced in clusters that are relatively closely spaced,
* to increase the occurrences of hits to the internal break cache.
Expand Down

0 comments on commit 8b14c05

Please sign in to comment.