From a48801ec7f3f324c33aa2706d33ebc2271fc79d7 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Sat, 9 Dec 2023 01:45:39 -0800 Subject: [PATCH] ICU-22585 Fix infinity loop while unicode set contains single surrogate --- icu4c/source/common/rbbiscan.cpp | 19 ++++------ icu4c/source/test/intltest/rbbitst.cpp | 18 +++++++++ icu4c/source/test/intltest/rbbitst.h | 1 + .../com/ibm/icu/dev/test/rbbi/RBBITest.java | 38 +++++++++++++++---- 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index df75212a191e..121e5c26c8bf 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -770,6 +770,7 @@ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, Unicode RBBINode *usetNode = new RBBINode(RBBINode::uset); if (usetNode == nullptr) { error(U_MEMORY_ALLOCATION_ERROR); + delete setToAdopt; return; } usetNode->fInputSet = setToAdopt; @@ -1210,7 +1211,6 @@ RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { // //------------------------------------------------------------------------------ void RBBIRuleScanner::scanSet() { - UnicodeSet *uset; ParsePosition pos; int startPos; int i; @@ -1222,12 +1222,11 @@ void RBBIRuleScanner::scanSet() { pos.setIndex(fScanIndex); startPos = fScanIndex; UErrorCode localStatus = U_ZERO_ERROR; - uset = new UnicodeSet(); - if (uset == nullptr) { - localStatus = U_MEMORY_ALLOCATION_ERROR; - } else { - uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); + LocalPointer uset(new UnicodeSet(), localStatus); + if (U_FAILURE(localStatus)) { + return; } + uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); if (U_FAILURE(localStatus)) { // TODO: Get more accurate position of the error from UnicodeSet's return info. // UnicodeSet appears to not be reporting correctly at this time. @@ -1235,20 +1234,18 @@ void RBBIRuleScanner::scanSet() { RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex()); #endif error(localStatus); - delete uset; return; } // Verify that the set contains at least one code point. // - U_ASSERT(uset!=nullptr); + U_ASSERT(uset.isValid()); if (uset->isEmpty()) { // This set is empty. // Make it an error, because it almost certainly is not what the user wanted. // Also, avoids having to think about corner cases in the tree manipulation code // that occurs later on. error(U_BRK_RULE_EMPTY_SET); - delete uset; return; } @@ -1257,7 +1254,7 @@ void RBBIRuleScanner::scanSet() { // Don't just set fScanIndex because the line/char positions maintained // for error reporting would be thrown off. i = pos.getIndex(); - for (;;) { + for (;U_SUCCESS(*fRB->fStatus);) { if (fNextIndex >= i) { break; } @@ -1280,7 +1277,7 @@ void RBBIRuleScanner::scanSet() { // character categories for run time engine. // - Eliminates mulitiple instances of the same set. // - Creates a new uset node if necessary (if this isn't a duplicate.) - findSetFor(n->fText, n, uset); + findSetFor(n->fText, n, uset.orphan()); } } diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 16f1ea8081c7..e1d3f7c07c18 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -146,6 +146,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue); TESTCASE_AUTO(TestBug22581); TESTCASE_AUTO(TestBug22584); + TESTCASE_AUTO(TestBug22585); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -5860,6 +5861,23 @@ void RBBITest::TestExternalBreakEngineWithFakeTaiLe() { expected2 == actual2); } +// Test a single unpaired unpaired char (either surrogate low or high) in +// an Unicode set will not cause infinity loop. +void RBBITest::TestBug22585() { + UnicodeString rule = u"$a=["; + rule.append(0xdecb) // an unpaired surrogate high + .append("];"); + UParseError pe {}; + UErrorCode ec {U_ZERO_ERROR}; + RuleBasedBreakIterator bi(rule, pe, ec); + + rule = u"$a=["; + rule.append(0xd94e) // an unpaired surrogate low + .append("];"); + ec = U_ZERO_ERROR; + RuleBasedBreakIterator bi2(rule, pe, ec); +} + void RBBITest::TestBug22584() { // Creating a break iterator from a rule consisting of a very long // literal input string caused a stack overflow when deleting the diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 79bf5091d3cb..03dd2fb3a05b 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -100,6 +100,7 @@ class RBBITest: public IntlTest { void TestExternalBreakEngineWithFakeYue(); void TestBug22581(); void TestBug22584(); + void TestBug22585(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java index 15f7265d0636..1503f208dce0 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -908,7 +908,7 @@ public void TestBug13590() { assertEquals("Wrong number of breaks found", 2, breaksFound); } - /* Test handling of unpair surrogate. + /* Test handling of unpaired surrogate. */ @Test public void TestUnpairedSurrogate() { @@ -919,24 +919,24 @@ public void TestUnpairedSurrogate() { try { new RuleBasedBreakIterator("a\ud800b;"); - fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate."); + fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate."); } catch (IllegalArgumentException e) { - // expected exception with unpair surrogate. + // expected exception with unpaired surrogate. } catch (Exception e) { - fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e); + fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e); } try { new RuleBasedBreakIterator("a\ude00b;"); - fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate."); + fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate."); } catch (IllegalArgumentException e) { - // expected exception with unpair surrogate. + // expected exception with unpaired surrogate. } catch (Exception e) { - fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e); + fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e); } @@ -946,6 +946,30 @@ public void TestUnpairedSurrogate() { assertEquals("Rules does not match", rules, bi.toString()); } + @Test + public void TestBug22585() { + try { + new RuleBasedBreakIterator("$a=[\udecb];"); + fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate."); + } + catch (IllegalArgumentException e) { + // expected exception with unpaired surrogate. + } + catch (Exception e) { + fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e); + } + + try { + new RuleBasedBreakIterator("$a=[\ud94e];"); + fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate."); + } + catch (IllegalArgumentException e) { + // expected exception with unpaired surrogate. + } + catch (Exception e) { + fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e); + } + } /* Test preceding(index) and following(index), with semi-random indexes. * The random indexes are produced in clusters that are relatively closely spaced, * to increase the occurrences of hits to the internal break cache.