Skip to content

Commit

Permalink
ICU-22039 Line Break on Orthographic Syllable Boundaries
Browse files Browse the repository at this point in the history
This is an experimental implementation of the line breaking rules proposed in the
Unicode document L2/22-080R. It is not suitable for merging into ICU main.

Limitations:
   - ICU4C only.
   - Root locale only (not implemented for the various LB tailorings).
   - New Line Break properties implemented with hard-coded UnicodeSets. (unmaintainable)
   - RBBIMonkeyTest not updated. (There are two ICU monkey tests; the other is updated.)
  • Loading branch information
aheninger authored and eggrobin committed Jul 7, 2023
1 parent 996e1c0 commit 36438a9
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 15 deletions.
20 changes: 18 additions & 2 deletions icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@
!!chain;
!!quoted_literals_only;

# Prototype definitions for L2/22-080R
# Line breaking at orthographic syllable boundaries

$AK = [\u1B05-\u1B33 \u1B45-\u1B4C \u25CC \uA984-\uA9B2 \U00011005-\U00011037 \U00011071-\U00011072 \U00011075 \U00011305-\U0001130C \U0001130F-\U00011310 \U00011313-\U00011328 \U0001132A-\U00011330 \U00011332-\U00011333 \U00011335-\U00011339 \U00011360-\U00011361 \U00011392-\U000113B5 \U00011F04-\U00011F10 \U00011F12-\U00011F33];

$AP = [\U00011003-\U00011004 \U000113D1 \U00011F02];

$AS = [\u1BC0-\u1BE5 \uAA00-\uAA28 \U00011066-\U0001106F \U00011350 \U0001135E-\U0001135F \U00011380-\U00011389 \U0001138B \U0001138E \U00011390-\U00011391 \U00011EE0-\U00011EF1 \U00011F50-\U00011F59];

$VF = [\u1BF2-\u1BF3];

$VI = [\u1B44 \uA9C0 \U00011046 \U0001134D \U000113D0 \U00011F42];

$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BA = [:LineBreak = Break_After:];
Expand Down Expand Up @@ -82,7 +95,7 @@ $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.

$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
$CM = [[[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]] - [$VF $VI]];
$CMX = [[$CM] - [$ZWJ]];

# Dictionary character set, for triggering language-based break engines. Currently
Expand All @@ -97,7 +110,7 @@ $dictionary = [$SA];
# XX (Unknown, unassigned)
# as $AL (Alphabetic)
#
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
$ALPlus = [[$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]] - [$AS $AK $AP]];


## -------------------------------------------------
Expand Down Expand Up @@ -338,6 +351,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 28b Do not break inside orthographic syllables.
($AP $CM*)? ($AS | $AK) ($CM* $VI $CM* $AK)* ($CM* $VI | (($CM* $AS | $CM* $AK)? $CM* $VF))?;

# LB 29
$IS $CM* ($ALPlus | $HL);

Expand Down
89 changes: 76 additions & 13 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2657,6 +2657,11 @@ class RBBILineMonkey: public RBBIMonkeyKind {
UnicodeSet *fOP30;
UnicodeSet *fCP30;
UnicodeSet *fExtPictUnassigned;
UnicodeSet *fAK;
UnicodeSet *fAP;
UnicodeSet *fAS;
UnicodeSet *fVF;
UnicodeSet *fVI;

BreakIterator *fCharBI;
const UnicodeString *fText;
Expand Down Expand Up @@ -2727,6 +2732,16 @@ RBBILineMonkey::RBBILineMonkey() :
fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);

fAK = new UnicodeSet(
u"[\u1B05-\u1B33\u1B45-\u1B4C\u25CC\uA984-\uA9B2\U00011005-\U00011037\U00011071-\U00011072\U00011075\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U00011360-\U00011361\U00011392-\U000113B5\U00011F04-\U00011F10\U00011F12-\U00011F33]",
status);
fAP = new UnicodeSet(u"[\U00011003-\U00011004\U000113D1\U00011F02]", status);
fAS = new UnicodeSet(
u"[\u1BC0-\u1BE5\uAA00-\uAA28\U00011066-\U0001106F\U00011350\U0001135E-\U0001135F\U00011380-\U00011389\U0001138B\U0001138E\U00011390-\U00011391\U00011EE0-\U00011EF1\U00011F50-\U00011F59]",
status);
fVF = new UnicodeSet(u"[\u1BF2-\u1BF3]", status);
fVI = new UnicodeSet(u"[\u1B44\uA9C0\U00011046\U0001134D\U000113D0\U00011F42]", status);

if (U_FAILURE(status)) {
deferredStatus = status;
return;
Expand All @@ -2741,6 +2756,12 @@ RBBILineMonkey::RBBILineMonkey() :

fHH->add(u'\u2010'); // Hyphen, '‐'

fAL->removeAll(*fAK);
fAL->removeAll(*fAP);
fAL->removeAll(*fAS);
fCM->removeAll(*fVF);
fCM->removeAll(*fVI);

// Sets and names.
fSets->addElement(fBK, status); classNames.push_back("fBK");
fSets->addElement(fCR, status); classNames.push_back("fCR");
Expand Down Expand Up @@ -2786,18 +2807,27 @@ RBBILineMonkey::RBBILineMonkey() :
fSets->addElement(fOP30, status); classNames.push_back("fOP30");
fSets->addElement(fCP30, status); classNames.push_back("fCP30");
fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
fSets->addElement(fAK, status); classNames.push_back("fAK");
fSets->addElement(fAP, status); classNames.push_back("fAP");
fSets->addElement(fAS, status); classNames.push_back("fAS");
fSets->addElement(fVF, status); classNames.push_back("fVF");
fSets->addElement(fVI, status); classNames.push_back("fVI");


const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
"((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
"\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
"((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
// Hack for orthographic syllable prototype, to adjust CM property for use in numeric regexp.
// Note that 200d adjustment is permanent.

fNumberMatcher = new RegexMatcher(
UnicodeString(rules, -1, US_INV), 0, status);
UnicodeString CMx {u"[[[\\p{Line_Break=CM}]\\u200d]-[\\u1BF2-\\u1BF3\\u1B44\\uA9C0\\U00011046\\U0001134D\\U000113D0\\U00011F42]]"};
UnicodeString rules;
rules = rules + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?"
+ u"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(" + CMx + u")*)?"
+ u"((\\p{Line_Break=IS})(" + CMx + u")*)?"
+ u"\\p{Line_Break=NU}(" + CMx + u")*"
+ u"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(" + CMx + u")*)*"
+ u"((\\p{Line_Break=CL}|\\p{Line_Break=CP})(" + CMx + u")*)?"
+ u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?";

fNumberMatcher = new RegexMatcher(rules, 0, status);

fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

Expand Down Expand Up @@ -3289,9 +3319,37 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}

if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
continue;
if (fAP->contains(prevChar) && (fAK->contains(thisChar) || fAS->contains(thisChar))) {
setAppliedRule(pos, "LB 28b.1 AP x (AK | AS)");
continue;
}

if ((fAK->contains(prevChar) || fAS->contains(prevChar)) &&
(fVF->contains(thisChar) || fVI->contains(thisChar))) {
setAppliedRule(pos, "LB 28b.2 (AK | AS) x (VF | VI)");
continue;
}

if ((fAK->contains(prevCharX2) || fAS->contains(prevCharX2)) &&
fVI->contains(prevChar) && fAK->contains(thisChar)) {
setAppliedRule(pos, "LB 28b.3 (AK | AS) VI x AK");
continue;
}

if (nextPos < fText->length()) {
// Note: UnicodeString::char32At(length) returns U+FFFF, which is indistinguishable
// from a legitimate U+FFFF character, so test the length separately.
UChar32 nextChar = fText->char32At(nextPos);
if ((fAK->contains(prevChar) || fAS->contains(prevChar)) &&
(fAK->contains(thisChar) || fAS->contains(thisChar)) && fVF->contains(nextChar)) {
setAppliedRule(pos, "LB 28b.4 (AK | AS) x (AK | AS) VF");
continue;
}
}

if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
continue;
}

// (AL | NU) x OP
Expand Down Expand Up @@ -3392,6 +3450,11 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fOP30;
delete fCP30;
delete fExtPictUnassigned;
delete fAK;
delete fAP;
delete fAS;
delete fVF;
delete fVI;

delete fCharBI;
delete fNumberMatcher;
Expand Down
11 changes: 11 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2195,3 +2195,14 @@ Bangkok)•</data>
[y] [a] [b] [c] [d] {600};
</rules>
<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>

#
# Examples for line breaking at orthographic syllable boundaries
# From Unicode document L2/22-080R
#

<locale en>
<line>
<data>•\U00011F26•\U00011F02\U00011F2D•\U00011F26\U00011F42\U00011F26•\U00011F31\U00011F41•</data>
<data>•\u1BD7\u1BEC•\u1BD2\u1BEA\u1BC9\u1BF3•\u1BC2\u1BE7\u1BC9\u1BF3•</data>
<data>•\u1B18•\u1B27\u1B44\u200C\u1B2B\u1B38•\u1B31\u1B44\u1B1D\u1B36•</data>

0 comments on commit 36438a9

Please sign in to comment.