From d26a3214a16d039ba24459c3c9200988e643d60e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 4 Jul 2024 01:31:31 +0200 Subject: [PATCH] ICU-22707 Rules that seem to actually work. (Needs tweaks to comments and some squashing.) --- icu4c/source/data/brkitr/rules/line.txt | 114 +++++++++++------------- 1 file changed, 50 insertions(+), 64 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 3643d1a7ac00..e40bdd938ba8 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -122,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. # -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 [$QU - \p{Pi}] $BA $HY $NS $IN $NU $PR $PO $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # @@ -275,68 +275,50 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 -$LB18NonBreaks $CM* [$QU - \p{Pi}]; -^$CM+ [$QU - \p{Pi}]; - -[$QU - \p{Pf}] $CM* .; - -# LB 19a -# × QU ( [^$EastAsian] | eot ) -$LB18NonBreaks $CM* $QU $CM* [[^ \p{ea=F}\p{ea=W}\p{ea=H} $CM] {eof}]; -^$CM+ $QU $CM* [[^ \p{ea=F}\p{ea=W}\p{ea=H} $CM] {eof}]; - -# QU × [^$EastAsian] -($QU $CM*)+ [^\p{ea=F}\p{ea=W}\p{ea=H} $CM]; - -# [^$EastAsian] × QU -# ( sot | [^$EastAsian] ) QU × -[$LB18NonBreaks - [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $QU $CM* .; -^[$CM - [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $QU $CM* .; -^ $QU $CM* .; - -# LB19a chaining: - -# LB19a can be chained into from a large number of rules, including itself; -# it can also chain into LB15a on QU Pi. -# In the rules below, the expression -# ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) -# covers chaining on LB15a. -# TODO(egg): In addition, it can be chained into from LB15a followed by a CM, and in particular an LB19a-LB15a chain is possible. - -# Note: all lb=QU are outside [\p{ea=F}\p{ea=W}\p{ea=H}]. This takes -# care of the pure self-chaining of LB19a. -$LB18NonBreaks $CM* $QU $CM* ($QU $CM*)+ .; -^$CM+ $QU $CM* ($QU $CM*)+ .; -^ $QU $CM* ($QU $CM*)+ .; -# Chaining into LB15a: -$LB18NonBreaks $CM* $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?); -^$CM+ $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?); -^ $QU $CM* ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?); - -# Chaining on Pf QU from LB 15b: -$LB8NonBreaks [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -$CAN_CM $CM* [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -^$CM+ [\p{Pf} & $QU] $CM* ( ($QU $CM*)+ . | ($QU $CM*)* ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); - -# SP is never [$EastAsian], but can be non-breaking, in which case we need to manually chain on SP QU: -# Non-breaking SP from LB14: -$OP $CM* $SP+ ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -# Non-breaking SP from LB15a: -($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -# Non-breaking SP from LB15a following LB15b: -$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -# Chaining on RI QU from LB30a: -$RI $CM* $RI $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -# Chaining on IS QU from LB15d: -$SP $IS $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); - -# Non-breaking SP from LB14 followed by CM: -$OP $CM* $SP+ [ $CM - [\p{ea=F}\p{ea=W}\p{ea=H}] ] $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); -$OP $CM* $SP+ $CM+ $QU $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | $SP $CM+ $AL_FOLLOW?) ); - +$LB18NonBreaks $CM* $QU; +^$CM+ $QU; + +# OP and GL are subtracted because of LB14 and LB12 (there is no break after them). +# BA is subtracted because of LB21a: +# We must not poke a hole into HL U+3000 × [\p{Pi} & QU] [\p{ea=F}\p{ea=W}\p{ea=H}], +# where U+3000 is lb=BA and ea=W. +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + +$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + +$QU $CM* .; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; + +$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; + + +^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + +$OP $CM* $SP+ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + +$SP? $IS $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; # LB 20 # $CB @@ -344,6 +326,11 @@ $OP $CM* $SP+ $CM+ $QU $CM* ($QU $CM*)+ ( . | ([\p{Pi} & $QU] $CM* $SP*)+ ( . | # $LB20NonBreaks = [$LB18NonBreaks - $CB]; +[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$CB $CM* $ZWJ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +($RI $CM*)? $RI $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + # LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. # Originally added as a Finnish tailoring, now promoted to default ICU behavior. # Note: this is not default UAX-14 behaviour. See issue ICU-8151. @@ -355,7 +342,6 @@ $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: $OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB15a: -# TODO(egg): needs to be chained into the LB15as on the tail of a LB19. ($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; ^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB15a following LB15b: