diff --git a/icu4c/source/data/brkitr/fi.txt b/icu4c/source/data/brkitr/fi.txt deleted file mode 100644 index e672992edb19..000000000000 --- a/icu4c/source/data/brkitr/fi.txt +++ /dev/null @@ -1,8 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml -fi{ - boundaries{ - word:process(dependency){"word_fi_sv.brk"} - } -} diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index e9420c8c5ff3..a2eef17e6529 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -11,7 +11,7 @@ # These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # # Note: Updates to word.txt will usually need to be merged into -# word_POSIX.txt and word_fi_sv.txt also. +# word_POSIX.txt also. ############################################################################## # @@ -42,7 +42,7 @@ $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet}]; -$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +$MidLetter = [\p{Word_Break = MidLetter}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index 3cd0556eea70..73ddc8dc19b3 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -42,7 +42,7 @@ $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; -$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +$MidLetter = [\p{Word_Break = MidLetter} - [\:]]; $MidNum = [\p{Word_Break = MidNum} [.]]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/data/brkitr/rules/word_fi_sv.txt b/icu4c/source/data/brkitr/rules/word_fi_sv.txt deleted file mode 100644 index daf5b3554d7f..000000000000 --- a/icu4c/source/data/brkitr/rules/word_fi_sv.txt +++ /dev/null @@ -1,172 +0,0 @@ -# -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -# Copyright (C) 2002-2016, International Business Machines Corporation -# and others. All Rights Reserved. -# -# file: word_fi_sv.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 -# -# Note: Updates to word.txt will usually need to be merged into -# word_fi_sv.txt also. - -############################################################################## -# -# Character class definitions from TR 29 -# -############################################################################## - -!!chain; -!!quoted_literals_only; - - -# -# Character Class Definitions. -# - -$Han = [:Han:]; - -$CR = [\p{Word_Break = CR}]; -$LF = [\p{Word_Break = LF}]; -$Newline = [\p{Word_Break = Newline}]; -$Extend = [\p{Word_Break = Extend}-$Han]; -$ZWJ = [\p{Word_Break = ZWJ}]; -$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; -$Format = [\p{Word_Break = Format}]; -$Katakana = [\p{Word_Break = Katakana}]; -$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; -$ALetter = [\p{Word_Break = ALetter}]; -$Single_Quote = [\p{Word_Break = Single_Quote}]; -$Double_Quote = [\p{Word_Break = Double_Quote}]; -$MidNumLet = [\p{Word_Break = MidNumLet}]; -$MidLetter = [\p{Word_Break = MidLetter}]; -$MidNum = [\p{Word_Break = MidNum}]; -$Numeric = [\p{Word_Break = Numeric}]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$WSegSpace = [\p{Word_Break = WSegSpace}]; -$Extended_Pict = [\p{Extended_Pictographic}]; - -$Hiragana = [:Hiragana:]; -$Ideographic = [\p{Ideographic}]; - - -# Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode -# 5.0 or later as the definition of Complex_Context was corrected to include all -# characters requiring dictionary break. - -$Control = [\p{Grapheme_Cluster_Break = Control}]; -$HangulSyllable = [\uac00-\ud7a3]; -$ComplexContext = [:LineBreak = Complex_Context:]; -$KanaKanji = [$Han $Hiragana $Katakana]; -$dictionaryCJK = [$KanaKanji $HangulSyllable]; -$dictionary = [$ComplexContext $dictionaryCJK]; - -# TODO: check if handling of katakana in dictionary makes rules incorrect/void - -# leave CJK scripts out of ALetterPlus -$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; - - -## ------------------------------------------------- - -# Rule 3 - CR x LF -# -$CR $LF; - -# Rule 3c Do not break within emoji zwj sequences. -# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. -# -$ZWJ $Extended_Pict; - -# Rule 3d - Keep horizontal whitespace together. -# -$WSegSpace $WSegSpace; - -# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning -# of a region of Text. - -$ExFm = [$Extend $Format $ZWJ]; - -^$ExFm+; # This rule fires only when there are format or extend characters at the - # start of text, or immediately following another boundary. It groups them, in - # the event there are more than one. - -[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, - # with no special rule status value. - -$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but -$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. -$HangulSyllable {200}; -$Hebrew_Letter $ExFm* {200}; -$Katakana $ExFm* {400}; # note: these status values override those from rule 5 -$Hiragana $ExFm* {400}; # by virtue of being numerically larger. -$Ideographic $ExFm* {400}; # - -# -# rule 5 -# Do not break between most letters. -# -($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); - -# rule 6 and 7 -($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; - -# rule 7a -$Hebrew_Letter $ExFm* $Single_Quote {200}; - -# rule 7b and 7c -$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; - -# rule 8 - -$Numeric $ExFm* $Numeric; - -# rule 9 - -($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; - -# rule 10 - -$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); - -# rule 11 and 12 - -$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; - -# rule 13 -# to be consistent with $KanaKanji $KanaKanhi, changed -# from 300 to 400. -# See also TestRuleStatus in intltest/rbbiapts.cpp -$Katakana $ExFm* $Katakana {400}; - -# rule 13a/b - -$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) -$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) -$Numeric $ExFm* $ExtendNumLet {100}; # (13a) -$Katakana $ExFm* $ExtendNumLet {400}; # (13a) -$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) - -$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) -$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) -$ExtendNumLet $ExFm* $Numeric {100}; # (13b) -$ExtendNumLet $ExFm* $Katakana {400}; # (13b) - -# rules 15 - 17 -# Pairs of Regional Indicators stay together. -# With incoming rule chaining disabled by ^, this rule will match exactly two of them. -# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. -# -^$Regional_Indicator $ExFm* $Regional_Indicator; - -# special handling for CJK characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable {200}; -$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found - -# Rule 999 -# Match a single code point if no other rule applies. -.; diff --git a/icu4c/source/data/brkitr/sv.txt b/icu4c/source/data/brkitr/sv.txt deleted file mode 100644 index 09cef533f290..000000000000 --- a/icu4c/source/data/brkitr/sv.txt +++ /dev/null @@ -1,8 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml -sv{ - boundaries{ - word:process(dependency){"word_fi_sv.brk"} - } -} diff --git a/icu4c/source/data/xml/brkitr/fi.xml b/icu4c/source/data/xml/brkitr/fi.xml deleted file mode 100644 index 5081e7cd4edf..000000000000 --- a/icu4c/source/data/xml/brkitr/fi.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - %icu; -] -> - - - - - - - - - - - - - - diff --git a/icu4c/source/data/xml/brkitr/sv.xml b/icu4c/source/data/xml/brkitr/sv.xml deleted file mode 100644 index 5f1566942d57..000000000000 --- a/icu4c/source/data/xml/brkitr/sv.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - %icu; -] -> - - - - - - - - - - - - - - diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 31897a19ba04..0d1623083b6b 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1964,7 +1964,7 @@ RBBIWordMonkey::RBBIWordMonkey() fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status); fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status); fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status); - fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]", status); + fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status); fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status); fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status); fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status); diff --git a/icu4c/source/test/testdata/break_rules/word.txt b/icu4c/source/test/testdata/break_rules/word.txt index 002d1af5780d..5ace30266c8f 100644 --- a/icu4c/source/test/testdata/break_rules/word.txt +++ b/icu4c/source/test/testdata/break_rules/word.txt @@ -29,7 +29,7 @@ ALetter = [\p{Word_Break = ALetter}]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet}]; -MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +MidLetter = [\p{Word_Break = MidLetter}]; MidNum = [\p{Word_Break = MidNum}]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/test/testdata/break_rules/word_POSIX.txt b/icu4c/source/test/testdata/break_rules/word_POSIX.txt index fc4eedeb8bdc..7fdc1a1ee077 100644 --- a/icu4c/source/test/testdata/break_rules/word_POSIX.txt +++ b/icu4c/source/test/testdata/break_rules/word_POSIX.txt @@ -28,7 +28,7 @@ ALetter = [\p{Word_Break = ALetter}]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; -MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +MidLetter = [\p{Word_Break = MidLetter} - [\:]]; MidNum = [\p{Word_Break = MidNum} [.]]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 7d77588ef977..3eb591576ef5 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1583,7 +1583,7 @@ Bangkok)• -•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct.field<200> \ +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ •for<200> •CS<200>-•types<200>.• •\uFF92\uFF76\uFF9E<400> • •xx<200>@•yy<200>.• diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/fi.res b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/fi.res deleted file mode 100644 index 30775c77faae..000000000000 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/fi.res and /dev/null differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/res_index.res b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/res_index.res index d09052f87c6e..1def2eb4d0f2 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/res_index.res and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/res_index.res differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/sv.res b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/sv.res deleted file mode 100644 index 30775c77faae..000000000000 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/sv.res and /dev/null differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word.brk index e3c17fdcbc68..0b99e3ca97eb 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_POSIX.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_POSIX.brk index cada1d7983bf..cb01e5175979 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_POSIX.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_POSIX.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_fi_sv.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_fi_sv.brk deleted file mode 100644 index 0b99e3ca97eb..000000000000 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/word_fi_sv.brk and /dev/null differ diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index cabbfc50f438..2dabd5f2e890 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -404,7 +404,7 @@ static class RBBIWordMonkey extends RBBIMonkeyKind { fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); - fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]"); + fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word.txt index 002d1af5780d..5ace30266c8f 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word.txt @@ -29,7 +29,7 @@ ALetter = [\p{Word_Break = ALetter}]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet}]; -MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +MidLetter = [\p{Word_Break = MidLetter}]; MidNum = [\p{Word_Break = MidNum}]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt index fc4eedeb8bdc..7fdc1a1ee077 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt @@ -28,7 +28,7 @@ ALetter = [\p{Word_Break = ALetter}]; Single_Quote = [\p{Word_Break = Single_Quote}]; Double_Quote = [\p{Word_Break = Double_Quote}]; MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; -MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; +MidLetter = [\p{Word_Break = MidLetter} - [\:]]; MidNum = [\p{Word_Break = MidNum} [.]]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 7d77588ef977..3eb591576ef5 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1583,7 +1583,7 @@ Bangkok)• -•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct.field<200> \ +•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ •for<200> •CS<200>-•types<200>.• •\uFF92\uFF76\uFF9E<400> • •xx<200>@•yy<200>.•