diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index de9bc8e33a54..538e3865f3e0 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -23,21 +23,11 @@ !!chain; !!quoted_literals_only; -# Prototype definitions for L2/22-080R -# Line breaking at orthographic syllable boundaries - -$AK = [\u1B05-\u1B33 \u1B45-\u1B4C \u25CC \uA984-\uA9B2 \U00011005-\U00011037 \U00011071-\U00011072 \U00011075 \U00011305-\U0001130C \U0001130F-\U00011310 \U00011313-\U00011328 \U0001132A-\U00011330 \U00011332-\U00011333 \U00011335-\U00011339 \U00011360-\U00011361 \U00011392-\U000113B5 \U00011F04-\U00011F10 \U00011F12-\U00011F33]; - -$AP = [\U00011003-\U00011004 \U000113D1 \U00011F02]; - -$AS = [\u1BC0-\u1BE5 \uAA00-\uAA28 \U00011066-\U0001106F \U00011350 \U0001135E-\U0001135F \U00011380-\U00011389 \U0001138B \U0001138E \U00011390-\U00011391 \U00011EE0-\U00011EF1 \U00011F50-\U00011F59]; - -$VF = [\u1BF2-\u1BF3]; - -$VI = [\u1B44 \uA9C0 \U00011046 \U0001134D \U000113D0 \U00011F42]; - $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -77,6 +67,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -95,7 +87,7 @@ $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. -$CM = [[[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]] - [$VF $VI]]; +$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; $CMX = [[$CM] - [$ZWJ]]; # Dictionary character set, for triggering language-based break engines. Currently @@ -110,7 +102,7 @@ $dictionary = [$SA]; # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [[$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]] - [$AS $AK $AP]]; +$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; ## ------------------------------------------------- @@ -228,7 +220,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -238,7 +250,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -248,9 +260,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 ($CL | $CP) $CM* $SP* $NS; @@ -351,8 +360,8 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL -#LB 28b Do not break Orthographic syllables -($AP $CM*)? ($AS | $AK) ($CM* $VI $CM* $AK)* ($CM* $VI | (($CM* $AS | $CM* $AK)? $CM* $VF))?; +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt index e3fabc29b4e8..b5ba6d629660 100644 --- a/icu4c/source/data/brkitr/rules/line_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_cj.txt @@ -25,7 +25,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -65,6 +68,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -216,7 +221,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -226,7 +251,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -236,9 +261,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 ($CL | $CP) $CM* $SP* $NS; @@ -339,6 +361,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index ca76ecc9a2f3..37a8e0e84607 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -30,7 +30,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -71,6 +74,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -222,7 +227,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -232,7 +257,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -242,9 +267,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -349,6 +371,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 0be1e7cb0c81..1794e77087eb 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -38,7 +38,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. @@ -83,6 +86,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -234,7 +239,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -244,7 +269,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -254,9 +279,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -367,6 +389,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index 8b37d5acb368..811f4ef71db9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -40,7 +40,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. @@ -85,6 +88,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -247,7 +252,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -257,7 +282,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -267,9 +292,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -380,6 +402,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index 70acdb4aed67..a5c6a3dcaa4e 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -26,7 +26,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -66,6 +69,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -217,7 +222,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -227,7 +252,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -237,9 +262,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 ($CL | $CP) $CM* $SP* $NS; @@ -340,6 +362,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 7ed8b35081ab..f9a1e652a641 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -28,7 +28,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -69,6 +72,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -220,7 +225,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -230,7 +255,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -240,9 +265,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -346,6 +368,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt index 1aeafdf8028d..c8063db60bfe 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt @@ -30,7 +30,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -71,6 +74,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -233,7 +238,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -243,7 +268,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -253,9 +278,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -359,6 +381,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt index 290b9b8c83a7..e23de7474c63 100644 --- a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt @@ -26,7 +26,10 @@ !!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; +$AK = [:LineBreak = Aksara:]; $AL = [:LineBreak = Alphabetic:]; +$AP = [:LineBreak = Aksara_Prebase:]; +$AS = [:LineBreak = Aksara_Start:]; $BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; @@ -66,6 +69,8 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$VF = [:LineBreak = Virama_Final:]; +$VI = [:LineBreak = Virama:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -228,7 +233,27 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # by rule 8, CM following a SP is stand-alone. -# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# LB 15a +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ .; +($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^([\p{Pi} & $QU] $CM* $SP*)+ .; +^([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + +# LB 15b +$LB8NonBreaks [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; +^$CM+ [\p{Pf} & $QU] $CM* [$SP $GL $WJ $CL $QU $CP $EX $IS $SY $BK $CR $LF $NL $ZW {eof}]; + +# Messy interaction: manually chain between LB 15b and LB 15a on Pf Pi. +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ .; +^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP $CM+ $AL_FOLLOW?; + + +# LB 15c Force a break before start of a number with a leading decimal pt, e.g. " .23" # Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. # See issue ICU-20303 @@ -238,7 +263,7 @@ $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # -# LB 14b Do not break before numeric separators (IS), even after spaces. +# LB 15d Do not break before numeric separators (IS), even after spaces. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -248,9 +273,6 @@ $CAN_CM $CM* $IS; ^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -# LB 15 -$QU $CM* $SP* $OP; - # LB 16 ($CL | $CP) $CM* $SP* $NS; @@ -351,6 +373,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); ($ALPlus | $HL) $CM* ($ALPlus | $HL); ^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL +#LB 28a Do not break Orthographic syllables +($AP $CM*)? ($AS | $AK | [◌] ) ($CM* $VI $CM* ($AK | [◌] ))* ($CM* $VI | (($CM* ($AS | $AK | [◌] ) )? $CM* $VF))?; + # LB 29 $IS $CM* ($ALPlus | $HL); diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 8d4ddfe7bd9c..5d54b47ecb17 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -15,6 +15,7 @@ #if !UCONFIG_NO_BREAK_ITERATION #include +#include #include #include #include @@ -1453,21 +1454,29 @@ void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, pos = bi->first(); pos = bi->next(); + bool error = false; + std::set actualBreaks; + std::set expectedBreaks; while (pos != BreakIterator::DONE) { + actualBreaks.insert(pos); if (expectedI >= breakPositions->size()) { errln("Test file \"%s\", line %d, unexpected break found at position %d", testFileName, lineNumber, pos); + error = true; break; } expectedPos = breakPositions->elementAti(expectedI); + expectedBreaks.insert(expectedPos); if (pos < expectedPos) { - errln("Test file \"%s\", line %d, unexpected break found at position %d", - testFileName, lineNumber, pos); + errln("Test file \"%s\", line %d, unexpected break found at position %d", testFileName, + lineNumber, pos); + error = true; break; } if (pos > expectedPos) { errln("Test file \"%s\", line %d, failed to find expected break at position %d", - testFileName, lineNumber, expectedPos); + testFileName, lineNumber, expectedPos); + error = true; break; } pos = bi->next(); @@ -1475,8 +1484,32 @@ void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, } if (pos==BreakIterator::DONE && expectedIsize()) { - errln("Test file \"%s\", line %d, failed to find expected break at position %d", - testFileName, lineNumber, breakPositions->elementAti(expectedI)); + errln("Test file \"%s\", line %d, failed to find expected break at position %d", testFileName, + lineNumber, breakPositions->elementAti(expectedI)); + error = true; + } + + if (error) { + for (; pos != BreakIterator::DONE; pos = bi->next()) { + actualBreaks.insert(pos); + } + for (; expectedI < breakPositions->size(); ++expectedI) { + expectedBreaks.insert(breakPositions->elementAti(expectedI)); + } + UnicodeString expected; + UnicodeString actual; + for (int32_t i = 0; i < testString.length();) { + const UChar32 c = testString.char32At(i); + i += U16_LENGTH(c); + expected += expectedBreaks.count(i) == 1 ? "÷" : "×"; + actual += actualBreaks.count(i) == 1 ? "÷" : "×"; + expected += c; + actual += c; + } + expected += expectedBreaks.count(testString.length()) == 1 ? "÷" : "×"; + actual += actualBreaks.count(testString.length()) == 1 ? "÷" : "×"; + errln("Expected : " + expected); + errln("Actual : " + actual); } } @@ -2662,6 +2695,8 @@ class RBBILineMonkey: public RBBIMonkeyKind { UnicodeSet *fAS; UnicodeSet *fVF; UnicodeSet *fVI; + UnicodeSet *fPi; + UnicodeSet *fPf; BreakIterator *fCharBI; const UnicodeString *fText; @@ -2732,15 +2767,14 @@ RBBILineMonkey::RBBILineMonkey() : fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status); - fAK = new UnicodeSet( - u"[\u1B05-\u1B33\u1B45-\u1B4C\u25CC\uA984-\uA9B2\U00011005-\U00011037\U00011071-\U00011072\U00011075\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U00011360-\U00011361\U00011392-\U000113B5\U00011F04-\U00011F10\U00011F12-\U00011F33]", - status); - fAP = new UnicodeSet(u"[\U00011003-\U00011004\U000113D1\U00011F02]", status); - fAS = new UnicodeSet( - u"[\u1BC0-\u1BE5\uAA00-\uAA28\U00011066-\U0001106F\U00011350\U0001135E-\U0001135F\U00011380-\U00011389\U0001138B\U0001138E\U00011390-\U00011391\U00011EE0-\U00011EF1\U00011F50-\U00011F59]", - status); - fVF = new UnicodeSet(u"[\u1BF2-\u1BF3]", status); - fVI = new UnicodeSet(u"[\u1B44\uA9C0\U00011046\U0001134D\U000113D0\U00011F42]", status); + fAK = new UnicodeSet(uR"([\p{Line_Break=AK}])", status); + fAP = new UnicodeSet(uR"([\p{Line_Break=AP}])", status); + fAS = new UnicodeSet(uR"([\p{Line_Break=AS}])", status); + fVF = new UnicodeSet(uR"([\p{Line_Break=VF}])", status); + fVI = new UnicodeSet(uR"([\p{Line_Break=VI}])", status); + + fPi = new UnicodeSet(uR"([\p{Pi}])", status); + fPf = new UnicodeSet(uR"([\p{Pf}])", status); if (U_FAILURE(status)) { deferredStatus = status; @@ -2756,12 +2790,6 @@ RBBILineMonkey::RBBILineMonkey() : fHH->add(u'\u2010'); // Hyphen, '‐' - fAL->removeAll(*fAK); - fAL->removeAll(*fAP); - fAL->removeAll(*fAS); - fCM->removeAll(*fVF); - fCM->removeAll(*fVI); - // Sets and names. fSets->addElement(fBK, status); classNames.push_back("fBK"); fSets->addElement(fCR, status); classNames.push_back("fCR"); @@ -2814,10 +2842,7 @@ RBBILineMonkey::RBBILineMonkey() : fSets->addElement(fVI, status); classNames.push_back("fVI"); - // Hack for orthographic syllable prototype, to adjust CM property for use in numeric regexp. - // Note that 200d adjustment is permanent. - - UnicodeString CMx {u"[[[\\p{Line_Break=CM}]\\u200d]-[\\u1BF2-\\u1BF3\\u1B44\\uA9C0\\U00011046\\U0001134D\\U000113D0\\U00011F42]]"}; + UnicodeString CMx {uR"([[\p{Line_Break=CM}]\u200d])"}; UnicodeString rules; rules = rules + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?" + u"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(" + CMx + u")*)?" @@ -3119,39 +3144,72 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - - if (nextPos < fText->length()) { - // note: UnicodeString::char32At(length) returns ffff, not distinguishable - // from a legit ffff character. So test length separately. - UChar32 nextChar = fText->char32At(nextPos); - if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) { - setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space"); - break; + // Same as LB 14, scan backward for + // (sot | BK | CR | LF | NL | OP CM*| QU CM* | GL CM* | SP) [\p{Pi}&QU] CM* SP*. + tPos = prevPos; + // SP* (with the aforementioned Twist). + if (fSP->contains(prevChar)) { + while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); } } - - - if (fIS->contains(thisChar)) { - setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces."); - continue; + // CM*. + while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); } - - - if (fOP->contains(thisChar)) { - // Scan backwards from prevChar to see if it is preceded by QU CM* SP* - int tPos = prevPos; - while (tPos>0 && fSP->contains(fText->char32At(tPos))) { + // [\p{Pi}&QU]. + if (fPi->contains(fText->char32At(tPos)) && fQU->contains(fText->char32At(tPos))) { + if (tPos == 0) { + setAppliedRule(pos, "LB 15a sot [\\p{Pi}&QU] SP* ×"); + continue; + } else { tPos = fText->moveIndex32(tPos, -1); + if (fBK->contains(fText->char32At(tPos)) || fCR->contains(fText->char32At(tPos)) || + fLF->contains(fText->char32At(tPos)) || fNL->contains(fText->char32At(tPos)) || + fSP->contains(fText->char32At(tPos)) || fZW->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 15a (BK | CR | LF | NL | SP | ZW) [\\p{Pi}&QU] SP* ×"); + continue; + } } - while (tPos>0 && fCM->contains(fText->char32At(tPos))) { + // CM*. + while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { tPos = fText->moveIndex32(tPos, -1); } - if (fQU->contains(fText->char32At(tPos))) { - setAppliedRule(pos, "LB 15 QU SP* x OP"); + if (fOP->contains(fText->char32At(tPos)) || fQU->contains(fText->char32At(tPos)) || + fGL->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 15a (OP | QU | GL) [\\p{Pi}&QU] SP* ×"); continue; } } + if (fPf->contains(thisChar) && fQU->contains(thisChar)) { + UChar32 nextChar = fText->char32At(nextPos); + if (nextPos == fText->length() || fSP->contains(nextChar) || fGL->contains(nextChar) || + fWJ->contains(nextChar) || fCL->contains(nextChar) || fQU->contains(nextChar) || + fCP->contains(nextChar) || fEX->contains(nextChar) || fIS->contains(nextChar) || + fSY->contains(nextChar) || fBK->contains(nextChar) || fCR->contains(nextChar) || + fLF->contains(nextChar) || fNL->contains(nextChar) || fZW->contains(nextChar)) { + setAppliedRule(pos, "LB 15b × [\\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS | SY " + "| BK | CR | LF | NL | ZW | eot)"); + continue; + } + } + + if (nextPos < fText->length()) { + // note: UnicodeString::char32At(length) returns ffff, not distinguishable + // from a legit ffff noncharacter. So test length separately. + UChar32 nextChar = fText->char32At(nextPos); + if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) { + setAppliedRule(pos, + "LB 15c Break before an IS that begins a number and follows a space"); + break; + } + } + + if (fIS->contains(thisChar)) { + setAppliedRule(pos, "LB 15d Do not break before numeric separators, even after spaces."); + continue; + } // Scan backwards for SP* CM* (CL | CP) if (fNS->contains(thisChar)) { @@ -3313,36 +3371,38 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } - if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\")."); continue; } - if (fAP->contains(prevChar) && (fAK->contains(thisChar) || fAS->contains(thisChar))) { - setAppliedRule(pos, "LB 28b.1 AP x (AK | AS)"); + if (fAP->contains(prevChar) && + (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar))) { + setAppliedRule(pos, "LB 28a.1 AP x (AK | ◌ | AS)"); continue; } - if ((fAK->contains(prevChar) || fAS->contains(prevChar)) && + if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) && (fVF->contains(thisChar) || fVI->contains(thisChar))) { - setAppliedRule(pos, "LB 28b.2 (AK | AS) x (VF | VI)"); + setAppliedRule(pos, "LB 28a.2 (AK | ◌ | AS) x (VF | VI)"); continue; } - if ((fAK->contains(prevCharX2) || fAS->contains(prevCharX2)) && - fVI->contains(prevChar) && fAK->contains(thisChar)) { - setAppliedRule(pos, "LB 28b.3 (AK | AS) VI x AK"); + if ((fAK->contains(prevCharX2) || prevCharX2 == U'◌' || fAS->contains(prevCharX2)) && + fVI->contains(prevChar) && + (fAK->contains(thisChar) || thisChar == U'◌')) { + setAppliedRule(pos, "LB 28a.3 (AK | ◌ | AS) VI x (AK | ◌)"); continue; } if (nextPos < fText->length()) { // note: UnicodeString::char32At(length) returns ffff, not distinguishable - // from a legit ffff character. So test length separately. + // from a legit ffff noncharacter. So test length separately. UChar32 nextChar = fText->char32At(nextPos); - if ((fAK->contains(prevChar) || fAS->contains(prevChar)) && - (fAK->contains(thisChar) || fAS->contains(thisChar)) && fVF->contains(nextChar)) { - setAppliedRule(pos, "LB 28b.4 (AK | AS) x (AK | AS) VF"); + if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) && + (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar)) && + fVF->contains(nextChar)) { + setAppliedRule(pos, "LB 28a.4 (AK | ◌ | AS) x (AK | ◌ | AS) VF"); continue; } } @@ -3455,6 +3515,8 @@ RBBILineMonkey::~RBBILineMonkey() { delete fAS; delete fVF; delete fVI; + delete fPi; + delete fPf; delete fCharBI; delete fNumberMatcher; @@ -4269,7 +4331,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name "Break found but not expected"), name, i, seed); - for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) { + for (ci = startContext;; (ci = testText.moveIndex32(ci, 1))) { UChar32 c; c = testText.char32At(ci); diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 409f223ed6f4..c68a66b95f20 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -24,7 +24,10 @@ locale = en; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -63,6 +66,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -75,6 +80,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -106,19 +114,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -205,6 +221,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/break_rules/line_cj.txt b/icu4c/source/test/testdata/break_rules/line_cj.txt index e66b0d134ada..bcfc94f05485 100644 --- a/icu4c/source/test/testdata/break_rules/line_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_cj.txt @@ -17,7 +17,10 @@ locale = zh; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -56,6 +59,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -75,6 +80,9 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -106,19 +114,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -205,6 +221,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 54873a489698..05c2bea74025 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -24,7 +24,10 @@ locale = en@lb=loose; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -64,6 +67,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -76,6 +81,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -107,19 +115,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -207,6 +223,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 9d7b2cc12c6d..93a06be94310 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -38,7 +38,10 @@ locale = ja@lb=loose; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. @@ -82,6 +85,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -94,6 +99,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -126,19 +134,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -231,6 +247,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index c718cc2f44c1..0397ec5a5f2b 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -26,7 +26,10 @@ type = line; locale = en@lb=normal; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -65,6 +68,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -77,6 +82,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -108,19 +116,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -207,6 +223,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index a4e1428c2b2e..04889a31ca4e 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -27,7 +27,10 @@ type = line; locale = ja@lb=normal; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -67,6 +70,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -79,6 +84,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -110,19 +118,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -211,6 +227,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 175ec7966411..772acf24363c 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -2206,3 +2206,13 @@ Bangkok)• •\U00011F26•\U00011F02\U00011F2D•\U00011F26\U00011F42\U00011F26•\U00011F31\U00011F41• •\u1BD7\u1BEC•\u1BD2\u1BEA\u1BC9\u1BF3•\u1BC2\u1BE7\u1BC9\u1BF3• •\u1B18•\u1B27\u1B44\u200C\u1B2B\u1B38•\u1B31\u1B44\u1B1D\u1B36• + +# Line breaking around quotation marks (LB 15a and LB 15b). + + +•Some •« basic » •quoting• +•Some •»German« •quoting• +•( « bracketed » ) •quoting• +•« « Nesting » »• +•« Complex »« chaining » • +•« .618 »• # Interaction with the ICU tailoring to break before such numbers. \ No newline at end of file diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index cd832e89e145..de8c54a72393 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43e6290edfaef19328dfa54c5fc301b36c57489cec9658c5eeb13a186e8eaad7 -size 14340466 +oid sha256:c1952ee8136d59a785d51caf70e7f56ca477ed036e83465bafddf3ed8c86e5b5 +size 14351596 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 1ecdf571e51a..f585f17a6fa1 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d0821499d99b54fa4bb80a5aaca12cd07cc71aea8a372e7c273691891973cb9 -size 94829 +oid sha256:cbdbc0dabb200a00781ef7423cf2ad5a8fd341e1b47897477bfd64dc03537731 +size 94837 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 22664d934c8d..41facb5f388e 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8bd64e37315224765b48d7a527aacbff04c3a1227d2378e7c822e5e1a49b568 -size 831983 +oid sha256:ffc267e9bd1a9c95cbaf261de6c50856ba8d848d7198f18458d06b82e8536c7b +size 834969 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 7a30389d4679..ce45e1c13997 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -748,6 +748,13 @@ public boolean contains(int codePoint) { XUnicodeSet fOP30; XUnicodeSet fCP30; XUnicodeSet fExtPictUnassigned; + XUnicodeSet fAK; + XUnicodeSet fAP; + XUnicodeSet fAS; + XUnicodeSet fVF; + XUnicodeSet fVI; + XUnicodeSet fPi; + XUnicodeSet fPf; StringBuffer fText; int fOrigPositions; @@ -802,6 +809,14 @@ public boolean contains(int codePoint) { fOP30 = new XUnicodeSet("[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); fCP30 = new XUnicodeSet("[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); fExtPictUnassigned = new XUnicodeSet("[\\p{Extended_Pictographic}&\\p{Cn}]"); + fAK = new XUnicodeSet("[\\p{Line_Break=AK}]"); + fAP = new XUnicodeSet("[\\p{Line_Break=AP}]"); + fAS = new XUnicodeSet("[\\p{Line_Break=AS}]"); + fVF = new XUnicodeSet("[\\p{Line_Break=VF}]"); + fVI = new XUnicodeSet("[\\p{Line_Break=VI}]"); + + fPi = new XUnicodeSet("[\\p{Pi}]"); + fPf = new XUnicodeSet("[\\p{Pf}]"); // Remove dictionary characters. // The monkey test reference implementation of line break does not replicate the dictionary behavior, @@ -863,6 +878,11 @@ public boolean contains(int codePoint) { fSets.add(fOP30); fClassNames.add("OP30"); fSets.add(fCP30); fClassNames.add("CP30"); fSets.add(fExtPictUnassigned); fClassNames.add("fExtPictUnassigned"); + fSets.add(fAK); fClassNames.add("AK"); + fSets.add(fAP); fClassNames.add("AP"); + fSets.add(fAS); fClassNames.add("AS"); + fSets.add(fVF); fClassNames.add("VF"); + fSets.add(fVI); fClassNames.add("VI"); } @Override @@ -1095,34 +1115,69 @@ int next(int startPos) { continue; } - if (nextPos < fText.length()) { - int nextChar = fText.codePointAt(nextPos); - if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) { - setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space"); - break; + // Same as LB 14, scan backward for + // (sot | BK | CR | LF | NL | OP CM*| QU CM* | GL CM* | SP) [\p{Pi}&QU] CM* SP*. + tPos = prevPos; + // SP* (with the aforementioned Twist). + if (fSP.contains(prevChar)) { + while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { + tPos = moveIndex32(fText, tPos, -1); } } - - if (fIS.contains(thisChar)) { - setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces"); - continue; + // CM*. + while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { + tPos = moveIndex32(fText, tPos, -1); } - - if (fOP.contains(thisChar)) { - // Scan backwards from prevChar to see if it is preceded by QU CM* SP* - tPos = prevPos; - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { + // [\p{Pi}&QU]. + if (fPi.contains(UTF16.charAt(fText, tPos)) && fQU.contains(UTF16.charAt(fText, tPos))) { + if (tPos == 0) { + setAppliedRule(pos, "LB 15a sot [\\p{Pi}&QU] SP* ×"); + continue; + } else { tPos = moveIndex32(fText, tPos, -1); + if (fBK.contains(UTF16.charAt(fText, tPos)) || fCR.contains(UTF16.charAt(fText, tPos)) || + fLF.contains(UTF16.charAt(fText, tPos)) || fNL.contains(UTF16.charAt(fText, tPos)) || + fSP.contains(UTF16.charAt(fText, tPos)) || fZW.contains(UTF16.charAt(fText, tPos))) { + setAppliedRule(pos, "LB 15a (BK | CR | LF | NL | SP | ZW) [\\p{Pi}&QU] SP* ×"); + continue; + } } + // CM*. while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { tPos = moveIndex32(fText, tPos, -1); } - if (fQU.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 15 QU SP* x OP"); + if (fOP.contains(UTF16.charAt(fText, tPos)) || fQU.contains(UTF16.charAt(fText, tPos)) || + fGL.contains(UTF16.charAt(fText, tPos))) { + setAppliedRule(pos, "LB 15a (OP | QU | GL) [\\p{Pi}&QU] SP* ×"); + continue; + } + } + + if (fPf.contains(thisChar) && fQU.contains(thisChar)) { + int nextChar = UTF16.charAt(fText, nextPos); + if (nextPos == fText.length() || fSP.contains(nextChar) || fGL.contains(nextChar) || + fWJ.contains(nextChar) || fCL.contains(nextChar) || fQU.contains(nextChar) || + fCP.contains(nextChar) || fEX.contains(nextChar) || fIS.contains(nextChar) || + fSY.contains(nextChar) || fBK.contains(nextChar) || fCR.contains(nextChar) || + fLF.contains(nextChar) || fNL.contains(nextChar) || fZW.contains(nextChar)) { + setAppliedRule(pos, "LB 15b × [\\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | eot)"); continue; } } + if (nextPos < fText.length()) { + int nextChar = fText.codePointAt(nextPos); + if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) { + setAppliedRule(pos, "LB 15c Break before an IS that begins a number and follows a space"); + break; + } + } + + if (fIS.contains(thisChar)) { + setAppliedRule(pos, "LB 15d Do not break before numeric separators, even after spaces"); + continue; + } + if (fNS.contains(thisChar)) { tPos = prevPos; while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { @@ -1282,6 +1337,37 @@ int next(int startPos) { continue; } + if (fAP.contains(prevChar) && + (fAK.contains(thisChar) || thisChar == '◌' || fAS.contains(thisChar))) { + setAppliedRule(pos, "LB 28a.1 AP x (AK | ◌ | AS)"); + continue; + } + + if ((fAK.contains(prevChar) || prevChar == '◌' || fAS.contains(prevChar)) && + (fVF.contains(thisChar) || fVI.contains(thisChar))) { + setAppliedRule(pos, "LB 28a.2 (AK | ◌ | AS) x (VF | VI)"); + continue; + } + + if ((fAK.contains(prevCharX2) || prevCharX2 == '◌' || fAS.contains(prevCharX2)) && + fVI.contains(prevChar) && + (fAK.contains(thisChar) || thisChar == '◌')) { + setAppliedRule(pos, "LB 28a.3 (AK | ◌ | AS) VI x (AK | ◌)"); + continue; + } + + if (nextPos < fText.length()) { + // note: UnicodeString::char32At(length) returns ffff, not distinguishable + // from a legit ffff noncharacter. So test length separately. + int nextChar = UTF16.charAt(fText, nextPos); + if ((fAK.contains(prevChar) || prevChar == '◌' || fAS.contains(prevChar)) && + (fAK.contains(thisChar) || thisChar == '◌' || fAS.contains(thisChar)) && + fVF.contains(nextChar)) { + setAppliedRule(pos, "LB 28a.4 (AK | ◌ | AS) x (AK | ◌ | AS) VF"); + continue; + } + } + if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics"); continue; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index 409f223ed6f4..c68a66b95f20 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -24,7 +24,10 @@ locale = en; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -63,6 +66,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -75,6 +80,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -106,19 +114,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -205,6 +221,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt index e66b0d134ada..bcfc94f05485 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt @@ -17,7 +17,10 @@ locale = zh; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -56,6 +59,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -75,6 +80,9 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -106,19 +114,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -205,6 +221,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 54873a489698..05c2bea74025 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -24,7 +24,10 @@ locale = en@lb=loose; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -64,6 +67,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -76,6 +81,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -107,19 +115,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -207,6 +223,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 9d7b2cc12c6d..93a06be94310 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -38,7 +38,10 @@ locale = ja@lb=loose; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. @@ -82,6 +85,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -94,6 +99,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -126,19 +134,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -231,6 +247,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index c718cc2f44c1..0397ec5a5f2b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -26,7 +26,10 @@ type = line; locale = en@lb=normal; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -65,6 +68,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -77,6 +82,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -108,19 +116,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; @@ -207,6 +223,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index a4e1428c2b2e..04889a31ca4e 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -27,7 +27,10 @@ type = line; locale = ja@lb=normal; AI = [:LineBreak = Ambiguous:]; +AK = [:LineBreak = Aksara:]; AL = [:LineBreak = Alphabetic:]; +AP = [:LineBreak = Aksara_Prebase:]; +AS = [:LineBreak = Aksara_Start:]; BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; @@ -67,6 +70,8 @@ SA = [:LineBreak = Complex_Context:]; SG = [:LineBreak = Surrogate:]; SP = [:LineBreak = Space:]; SY = [:LineBreak = Break_Symbols:]; +VF = [:LineBreak = Virama_Final:]; +VI = [:LineBreak = Virama:]; WJ = [:LineBreak = Word_Joiner:]; XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; @@ -79,6 +84,9 @@ ZWJ = [:LineBreak = ZWJ:]; OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +PiQU = [\p{Pi}&QU]; +PfQU = [\p{Pf}&QU]; + # The redundant-looking inner brackets are required for the current parser in the test code. ExtPictUnassigned = [[\p{Extended_Pictographic}]&[\p{Cn}]]; @@ -110,19 +118,27 @@ LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; # Rules LB14 - LB17. + +# Moved before LB14, because it matches a supersequence. +LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; + # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 # while the prefix of it, "OP CM SP" matches LB7.1 LB14: OP CM* SP* .; -# LB 14a Break before an IS that begins a number and follows a space. -LB14a: SP ÷ IS CM* NU; +LB15a.2: ^ (PiQU CM* SP*)+ .; +# LB15b/LB15a chaining. +LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .; +LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ ); -# LB14b × IS -LB14b.1: [^SP] CM* IS; -LB14b.2: SP IS; +# LB 15c Break before an IS that begins a number and follows a space. +LB15c: SP ÷ IS CM* NU; + +# LB15d × IS +LB15d.1: [^SP] CM* IS; +LB15d.2: SP IS; -LB15: QU CM* SP* OP; # Do not break between closing punctuation and $NS, even with intervening spaces # But DO allow a break between closing punctuation and $NSX, don't include it here @@ -211,6 +227,8 @@ LB27.2: PR CM* (JL | JV | JT | H2 | H3); # Unattached (leading) CM treated as AL. LB28: (AL | HL | CM)CM* (AL | HL); +LB28: (AP CM*)? (AS | AK | [◌] ) (CM* VI CM* (AK | [◌] ))* (CM* VI | ((CM* (AS | AK | [◌] ) )? CM* VF))?; + LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index c0b6d1a55e3c..772acf24363c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -2195,3 +2195,24 @@ Bangkok)• [y] [a] [b] [c] [d] {600}; •ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600> + +# +# Examples for line breaking at orthographic syllable boundaries +# From Unicode document L2/22-080R +# + + + +•\U00011F26•\U00011F02\U00011F2D•\U00011F26\U00011F42\U00011F26•\U00011F31\U00011F41• +•\u1BD7\u1BEC•\u1BD2\u1BEA\u1BC9\u1BF3•\u1BC2\u1BE7\u1BC9\u1BF3• +•\u1B18•\u1B27\u1B44\u200C\u1B2B\u1B38•\u1B31\u1B44\u1B1D\u1B36• + +# Line breaking around quotation marks (LB 15a and LB 15b). + + +•Some •« basic » •quoting• +•Some •»German« •quoting• +•( « bracketed » ) •quoting• +•« « Nesting » »• +•« Complex »« chaining » • +•« .618 »• # Interaction with the ICU tailoring to break before such numbers. \ No newline at end of file diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line.brk index 7b469b6f9d55..72e472bd8cee 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_cj.brk index 67c640ab866c..1dd02adcc256 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose.brk index cc26a1955d4a..8747c8e89c17 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_cj.brk index 2c6046153d66..04232dc65971 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_phrase_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_phrase_cj.brk index c839a2a68348..ad6c08c20fb5 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_phrase_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_loose_phrase_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal.brk index e11724e3a15c..3f0b62a13c89 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_cj.brk index 4541cab15816..2a6e2017ab0d 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_phrase_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_phrase_cj.brk index d7bab641e771..3f03503bd334 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_phrase_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_normal_phrase_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_phrase_cj.brk b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_phrase_cj.brk index 41c651cb8832..bb2e891818a6 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_phrase_cj.brk and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/brkitr/line_phrase_cj.brk differ diff --git a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/translit/root.res b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/translit/root.res index 329fd9a06fb3..585541536266 100644 Binary files a/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/translit/root.res and b/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/icudt73b/translit/root.res differ diff --git a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/format.res b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/format.res index 7d261fb59c8c..fcdb6541e5c7 100644 Binary files a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/format.res and b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/format.res differ diff --git a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/te.res b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/te.res index 7a0da7210553..c711314ff2b7 100644 Binary files a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/te.res and b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/te.res differ diff --git a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testnorm.nrm b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testnorm.nrm index 636db116dd14..4552eb55519e 100644 Binary files a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testnorm.nrm and b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testnorm.nrm differ diff --git a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testtypes.res b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testtypes.res index de026c414020..91d278739d98 100644 Binary files a/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testtypes.res and b/icu4j/maven-build/maven-icu4j-test-datafiles/src/main/resources/com/ibm/icu/dev/data/testdata/testtypes.res differ