From 05668fadc187bbd1d6fdad56cc4649fd3bdf4dec Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Wed, 1 Nov 2023 11:12:57 +0530 Subject: [PATCH] Regenerate terminators, fix Amharic text Regenerate terminators using Unicode::Tussle Fixes issue #3 --- sentencex/terminators.py | 308 +++++++++++++++++++++------------------ test/unit/test_am.py | 2 +- 2 files changed, 166 insertions(+), 144 deletions(-) diff --git a/sentencex/terminators.py b/sentencex/terminators.py index eec99c3..a54911d 100644 --- a/sentencex/terminators.py +++ b/sentencex/terminators.py @@ -1,143 +1,165 @@ -# unicode code points with the \p{Sentence_Break=STerm} or \p{Sentence_Break=ATerm} properties that -# also have the \p{Terminal_Punctuation} property generated with Unicode::Tussle perl script and -# additional fullstops in unicode character sets : https://www.fileformat.info/info/unicode/char/search.htm?q=.& -# preview=entity -GLOBAL_SENTENCE_TERMINATORS = [ - "...", # Horizontal Ellipsis - "!", # Exclamation Mark - ".", # Full Stop - "?", # Question Mark - "։", # Armenian Full Stop - "؞", # Arabic Sign Sallallahou Alayhe Wasallam - "؟", # Arabic Question Mark - "۔", # Arabic Full Stop - "܀", # Syriac End of Paragraph - "܁", # Syriac Supralinear Colon - "܂", # Syriac Sublinear Colon - "߹", # Nko Symbol Doorye - "࠷", # Samarkan Letter Do - "࠹", # Samarkan Letter Jho - "࠽", # Samarkan Letter Ro - "࠾", # Samarkan Letter Lo - "।", # Devanagari Danda - "॥", # Devanagari Double Danda - "၊", # Myanmar Sign Myanmar Phrase Stop - "။", # Myanmar Sign Myanmar Paragraph - "።", # Ethiopic Full Stop - "፧", # Ethiopic Colon - "፨", # Ethiopic Preface Colon - "᙮", # Ethiopic Question Mark - "᜵", # Buginese Vowel Sign E - "᜶", # Buginese Vowel Sign O - "᠃", # Mongolian Full Stop - "᠉", # Mongolian Birga - "᥄", # Buhid Virama - "᥅", # Buhid Punctuation Mark - "᪨", # Tai Tham Consonant Sign Medial Ra - "᪩", # Tai Tham Consonant Sign Medial La - "᪪", # Tai Tham Consonant Sign La Taa - "᪫", # Tai Tham Sign Mai 
Sak - "᭚", # Balinese Pameneng - "᭛", # Balinese Musical Symbol Combining Jublag - "᭞", # Sundanese Padasan Agung - "᭟", # Sundanese Paneken - "᰻", # Buhid Pamudpod - "᰼", # Buhid Pamudpod Han - "᱾", # Limbu Question Mark - "᱿", # Limbu Exclamation Mark - "‼", # Double Exclamation Mark - "‽", # Interrobang - "⁇", # Double Question Mark - "⁈", # Question Exclamation Mark - "⁉", # Exclamation Question Mark - "⸮", # Reversed Question Mark - "⸼", # Armenian Parenthesis Right - "꓿", # Yi Punctuation Small Comma - "꘎", # Vai Comma - "꘏", # Vai Full Stop - "꛳", # Batak Apostrophe - "꛷", # Batak Pangolat - "꡶", # Lanna Punctation Phrase - "꡷", # Lanna Punctation Paragraph - "꣎", # Ol Chiki Punctuation Mucaad - "꣏", # Ol Chiki Punctuation Double - "꤯", # Chakma Sign Visarga - "꧈", # Balinese Musical Symbol Left-Hand Open Dug - "꧉", # Balinese Musical Symbol Right-Hand Open Dug - "꩝", # Cham Consonant Sign Final H - "꩞", # Cham Consonant Sign Glottal Stop - "꩟", # Cham Consonant Sign M - "꫰", # Tai Viet Mai Khit - "꫱", # Tai Viet Vowel Ia - "꯫", # Meetei Mayek Cheikhei - "﹒", # Small Full Stop - "﹖", # Small Question Mark - "﹗", # Small Exclamation Mark - "!", # Fullwidth Exclamation Mark - ".", # Fullwidth Full Stop - "?", # Fullwidth Question Mark - "ཕ", # Tibetan Letter Pha - "བ", # Tibetan Letter Ba - "བྷ", # Tibetan Letter Bha - "མ", # Tibetan Letter Ma - "ཙ", # Tibetan Letter Tsa - "၇", # Myanmar Digit Seven - "၈", # Myanmar Digit Eight - "Ⴞ", # Georgian Letter Har - "Ⴟ", # Georgian Letter Hae - "Ⴠ", # Georgian Letter Hoe - "Ⴡ", # Georgian Letter Yu - "ᅁ", # Hangul Letter Yeorin Hieuh - "ᅂ", # Hangul Letter Yeorin Simeum - "ᅃ", # Hangul Letter Yeorin Cieuc - "ᇅ", # Hangul Letter Phieuph-Pieup - "ᇆ", # Hangul Letter Kapyeounphieuph - "ᇍ", # Hangul Letter Kapyeounhieuh - "ᇞ", # Hangul Letter Yang-Hieuh - "ᇟ", # Hangul Letter Yo-Yae - "ሸ", # Ethiopic Syllable Shee - "ሹ", # Ethiopic Syllable Shuu - "ሻ", # Ethiopic Syllable Shaa - "ሼ", # Ethiopic Syllable She - "ኩ", # 
Ethiopic Syllable Ku - "ᑋ", # Canadian Syllabics We - "ᑌ", # Canadian Syllabics West-Cree Pa - "ᗂ", # Canadian Syllabics South Slavey Lo - "ᗃ", # Canadian Syllabics South Slavey Lu - "ᗉ", # Canadian Syllabics Carrier Syllabic Yay - "ᗊ", # Canadian Syllabics Carrier Syllabic Yaa - "ᗋ", # Canadian Syllabics Carrier Syllabic Ywe - "ᗌ", # Canadian Syllabics Carrier Syllabic Ywi - "ᗍ", # Canadian Syllabics Carrier Syllabic Ywii - "ᗎ", # Canadian Syllabics Carrier Syllabic Ywo - "ᗏ", # Canadian Syllabics Carrier Syllabic Ywoo - "ᗐ", # Canadian Syllabics Carrier Syllabic Ywi - "ᗗ", # Canadian Syllabics Cree-Cha - "ᙁ", # Canadian Syllabics Slavey She - "ᙂ", # Canadian Syllabics Chipewyan Ga - "᥄", # Ethiopic Syllable Gwa - "᥆", # Ethiopic Syllable Gwo - "ᩂ", # Tai Tham Consonant Sign Low Ha - "ᩃ", # Tai Tham Consonant Sign High Ha - "᱁", # Ethiopic Syllable Hoa - "᱂", # Ethiopic Syllable Hoa - "ỷ", # Latin Small Letter Y With Tilde - "Ỹ", # Latin Capital Letter Y With Tilde - "橮", # CJK Unified Ideograph-6AEE - "橯", # CJK Unified Ideograph-6AEF - "櫵", # CJK Unified Ideograph-6AF5 - "欷", # CJK Unified Ideograph-6B37 - "欸", # CJK Unified Ideograph-6B38 - "歄", # CJK Unified Ideograph-6B84 - "溘", # CJK Unified Ideograph-6E98 - "벟", # Hangul Syllable Eq - "⳹", # Greek Small Letter Ous - "⳾", # Greek Small Letter Psi - "。", # Ideographic Full Stop - "︒", # Presentation Form For Vertical Ideographic Full Stop - "。", # Halfwidth Katakana Middle Dot - "𖫵", # Mongolian Vowel Separator - "𖺘", # Mongolian Letter Ali Gali U - "𛲟", # Hanifi Rohingya Sign Harbahay - "𝪈", # Mathematical Bold Capital U -] # 150 symbols +# unicode code points generated with Unicode::Tussle perl script: +# unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", #"' +# ruff: noqa: E501 +GLOBAL_SENTENCE_TERMINATORS = ( + [ + "!", # U+00021 BC=ON BLK=Basic_Latin SC=Common EXCLAMATION MARK + ".", # U+0002E BC=CS BLK=Basic_Latin SC=Common FULL STOP + "?", # U+0003F BC=ON 
BLK=Basic_Latin SC=Common QUESTION MARK + "։", # U+00589 BC=L BLK=Armenian SC=Armenian ARMENIAN FULL STOP + "؝", # U+0061D BC=AL BLK=Arabic SC=Arabic ARABIC END OF TEXT MARK + "؞", # U+0061E BC=AL BLK=Arabic SC=Arabic ARABIC TRIPLE DOT PUNCTUATION MARK + "؟", # U+0061F BC=AL BLK=Arabic SC=Common ARABIC QUESTION MARK + "۔", # U+006D4 BC=AL BLK=Arabic SC=Arabic ARABIC FULL STOP + "܀", # U+00700 BC=AL BLK=Syriac SC=Syriac SYRIAC END OF PARAGRAPH + "܁", # U+00701 BC=AL BLK=Syriac SC=Syriac SYRIAC SUPRALINEAR FULL STOP + "܂", # U+00702 BC=AL BLK=Syriac SC=Syriac SYRIAC SUBLINEAR FULL STOP + "߹", # U+007F9 BC=ON BLK=NKo SC=Nko NKO EXCLAMATION MARK + "࠷", # U+00837 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION MELODIC QITSA + "࠹", # U+00839 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION QITSA + "࠽", # U+0083D BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION SOF MASHFAAT + "࠾", # U+0083E BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION ANNAAU + "।", # U+00964 BC=L BLK=Devanagari SC=Common DEVANAGARI DANDA + "॥", # U+00965 BC=L BLK=Devanagari SC=Common DEVANAGARI DOUBLE DANDA + "၊", # U+0104A BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN LITTLE SECTION + "။", # U+0104B BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN SECTION + "።", # U+01362 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC FULL STOP + "፧", # U+01367 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC QUESTION MARK + "፨", # U+01368 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC PARAGRAPH SEPARATOR + "᙮", # U+0166E BC=L BLK=Unified_Canadian_Aboriginal_Syllabics SC=Canadian_Aboriginal CANADIAN SYLLABICS FULL STOP + "᜵", # U+01735 BC=L BLK=Hanunoo SC=Common PHILIPPINE SINGLE PUNCTUATION + "᜶", # U+01736 BC=L BLK=Hanunoo SC=Common PHILIPPINE DOUBLE PUNCTUATION + "᠃", # U+01803 BC=ON BLK=Mongolian SC=Common MONGOLIAN FULL STOP + "᠉", # U+01809 BC=ON BLK=Mongolian SC=Mongolian MONGOLIAN MANCHU FULL STOP + "᥄", # U+01944 BC=ON BLK=Limbu SC=Limbu LIMBU EXCLAMATION MARK + "᥅", # U+01945 BC=ON BLK=Limbu SC=Limbu LIMBU QUESTION MARK + 
"᪨", # U+01AA8 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAAN + "᪩", # U+01AA9 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAANKUU + "᪪", # U+01AAA BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAAN + "᪫", # U+01AAB BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAANKUU + "᭚", # U+01B5A BC=L BLK=Balinese SC=Balinese BALINESE PANTI + "᭛", # U+01B5B BC=L BLK=Balinese SC=Balinese BALINESE PAMADA + "᭞", # U+01B5E BC=L BLK=Balinese SC=Balinese BALINESE CARIK SIKI + "᭟", # U+01B5F BC=L BLK=Balinese SC=Balinese BALINESE CARIK PAREREN + "᭽", # U+01B7D BC=L BLK=Balinese SC=Balinese BALINESE PANTI LANTANG + "᭾", # U+01B7E BC=L BLK=Balinese SC=Balinese BALINESE PAMADA LANTANG + "᰻", # U+01C3B BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION TA-ROL + "᰼", # U+01C3C BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION NYET THYOOM TA-ROL + "᱾", # U+01C7E BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION MUCAAD + "᱿", # U+01C7F BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION DOUBLE MUCAAD + "․", # U+02024 BC=ON BLK=General_Punctuation SC=Common ONE DOT LEADER + "‼", # U+0203C BC=ON BLK=General_Punctuation SC=Common DOUBLE EXCLAMATION MARK + "‽", # U+0203D BC=ON BLK=General_Punctuation SC=Common INTERROBANG + "⁇", # U+02047 BC=ON BLK=General_Punctuation SC=Common DOUBLE QUESTION MARK + "⁈", # U+02048 BC=ON BLK=General_Punctuation SC=Common QUESTION EXCLAMATION MARK + "⁉", # U+02049 BC=ON BLK=General_Punctuation SC=Common EXCLAMATION QUESTION MARK + "⸮", # U+02E2E BC=ON BLK=Supplemental_Punctuation SC=Common REVERSED QUESTION MARK + "⸼", # U+02E3C BC=ON BLK=Supplemental_Punctuation SC=Common STENOGRAPHIC FULL STOP + "⹓", # U+02E53 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL EXCLAMATION MARK + "⹔", # U+02E54 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL QUESTION MARK + "꓿", # U+0A4FF BC=L BLK=Lisu SC=Lisu LISU PUNCTUATION FULL STOP + "꘎", # U+0A60E BC=ON BLK=Vai SC=Vai VAI FULL STOP + "꘏", # U+0A60F BC=ON BLK=Vai SC=Vai VAI QUESTION MARK + "꛳", # U+0A6F3 BC=L BLK=Bamum 
SC=Bamum BAMUM FULL STOP + "꛷", # U+0A6F7 BC=L BLK=Bamum SC=Bamum BAMUM QUESTION MARK + "꡶", # U+0A876 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK SHAD + "꡷", # U+0A877 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK DOUBLE SHAD + "꣎", # U+0A8CE BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DANDA + "꣏", # U+0A8CF BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DOUBLE DANDA + "꤯", # U+0A92F BC=L BLK=Kayah_Li SC=Kayah_Li KAYAH LI SIGN SHYA + "꧈", # U+0A9C8 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LINGSA + "꧉", # U+0A9C9 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LUNGSI + "꩝", # U+0AA5D BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DANDA + "꩞", # U+0AA5E BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DOUBLE DANDA + "꩟", # U+0AA5F BC=L BLK=Cham SC=Cham CHAM PUNCTUATION TRIPLE DANDA + "꫰", # U+0AAF0 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK CHEIKHAN + "꫱", # U+0AAF1 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK AHANG KHUDAM + "꯫", # U+0ABEB BC=L BLK=Meetei_Mayek SC=Meetei_Mayek MEETEI MAYEK CHEIKHEI + "﹒", # U+0FE52 BC=CS BLK=Small_Form_Variants SC=Common SMALL FULL STOP + "﹖", # U+0FE56 BC=ON BLK=Small_Form_Variants SC=Common SMALL QUESTION MARK + "﹗", # U+0FE57 BC=ON BLK=Small_Form_Variants SC=Common SMALL EXCLAMATION MARK + "！", # U+0FF01 BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH EXCLAMATION MARK + "．", # U+0FF0E BC=CS BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH FULL STOP + "？", # U+0FF1F BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH QUESTION MARK + "𐩖", # U+10A56 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DANDA + "𐩗", # U+10A57 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DOUBLE DANDA + "𐽕", # U+10F55 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS + "𐽖", # U+10F56 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS + "𐽗", # U+10F57 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION CIRCLE WITH DOT + "𐽘", # U+10F58 BC=AL BLK=Sogdian 
SC=Sogdian SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS + "𐽙", # U+10F59 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT + "𐾆", # U+10F86 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION BAR + "𐾇", # U+10F87 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO BARS + "𐾈", # U+10F88 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO DOTS + "𐾉", # U+10F89 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION FOUR DOTS + "𑁇", # U+11047 BC=L BLK=Brahmi SC=Brahmi BRAHMI DANDA + "𑁈", # U+11048 BC=L BLK=Brahmi SC=Brahmi BRAHMI DOUBLE DANDA + "𑂾", # U+110BE BC=L BLK=Kaithi SC=Kaithi KAITHI SECTION MARK + "𑂿", # U+110BF BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE SECTION MARK + "𑃀", # U+110C0 BC=L BLK=Kaithi SC=Kaithi KAITHI DANDA + "𑃁", # U+110C1 BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE DANDA + "𑅁", # U+11141 BC=L BLK=Chakma SC=Chakma CHAKMA DANDA + "𑅂", # U+11142 BC=L BLK=Chakma SC=Chakma CHAKMA DOUBLE DANDA + "𑅃", # U+11143 BC=L BLK=Chakma SC=Chakma CHAKMA QUESTION MARK + "𑇅", # U+111C5 BC=L BLK=Sharada SC=Sharada SHARADA DANDA + "𑇆", # U+111C6 BC=L BLK=Sharada SC=Sharada SHARADA DOUBLE DANDA + "𑇍", # U+111CD BC=L BLK=Sharada SC=Sharada SHARADA SUTRA MARK + "𑇞", # U+111DE BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-1 + "𑇟", # U+111DF BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-2 + "𑈸", # U+11238 BC=L BLK=Khojki SC=Khojki KHOJKI DANDA + "𑈹", # U+11239 BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE DANDA + "𑈻", # U+1123B BC=L BLK=Khojki SC=Khojki KHOJKI SECTION MARK + "𑈼", # U+1123C BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE SECTION MARK + "𑊩", # U+112A9 BC=L BLK=Multani SC=Multani MULTANI SECTION MARK + "𑑋", # U+1144B BC=L BLK=Newa SC=Newa NEWA DANDA + "𑑌", # U+1144C BC=L BLK=Newa SC=Newa NEWA DOUBLE DANDA + "𑗂", # U+115C2 BC=L BLK=Siddham SC=Siddham SIDDHAM DANDA + "𑗃", # U+115C3 BC=L BLK=Siddham SC=Siddham SIDDHAM DOUBLE DANDA + "𑗉", # U+115C9 BC=L BLK=Siddham SC=Siddham SIDDHAM END OF TEXT MARK + "𑗊", # U+115CA BC=L 
BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS + "𑗋", # U+115CB BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS + "𑗌", # U+115CC BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS + "𑗍", # U+115CD BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS + "𑗎", # U+115CE BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS + "𑗏", # U+115CF BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING + "𑗐", # U+115D0 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING WITH RAYS + "𑗑", # U+115D1 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS + "𑗒", # U+115D2 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS + "𑗓", # U+115D3 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS + "𑗔", # U+115D4 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS + "𑗕", # U+115D5 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND RAYS + "𑗖", # U+115D6 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES + "𑗗", # U+115D7 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES + "𑙁", # U+11641 BC=L BLK=Modi SC=Modi MODI DANDA + "𑙂", # U+11642 BC=L BLK=Modi SC=Modi MODI DOUBLE DANDA + "𑜼", # U+1173C BC=L BLK=Ahom SC=Ahom AHOM SIGN SMALL SECTION + "𑜽", # U+1173D BC=L BLK=Ahom SC=Ahom AHOM SIGN SECTION + "𑜾", # U+1173E BC=L BLK=Ahom SC=Ahom AHOM SIGN RULAI + "𑥄", # U+11944 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU DOUBLE DANDA + "𑥆", # U+11946 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU END OF TEXT MARK + "𑩂", # U+11A42 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK SHAD + "𑩃", # U+11A43 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK DOUBLE SHAD + "𑪛", # U+11A9B BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK SHAD + "𑪜", # 
U+11A9C BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK DOUBLE SHAD + "𑱁", # U+11C41 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DANDA + "𑱂", # U+11C42 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DOUBLE DANDA + "𑻷", # U+11EF7 BC=L BLK=Makasar SC=Makasar MAKASAR PASSIMBANG + "𑻸", # U+11EF8 BC=L BLK=Makasar SC=Makasar MAKASAR END OF SECTION + "𑽃", # U+11F43 BC=L BLK=Kawi SC=Kawi KAWI DANDA + "𑽄", # U+11F44 BC=L BLK=Kawi SC=Kawi KAWI DOUBLE DANDA + "𖩮", # U+16A6E BC=L BLK=Mro SC=Mro MRO DANDA + "𖩯", # U+16A6F BC=L BLK=Mro SC=Mro MRO DOUBLE DANDA + "𖫵", # U+16AF5 BC=L BLK=Bassa_Vah SC=Bassa_Vah BASSA VAH FULL STOP + "𖬷", # U+16B37 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS THOM + "𖬸", # U+16B38 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS TSHAB CEEB + "𖭄", # U+16B44 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN XAUS + "𖺘", # U+16E98 BC=L BLK=Medefaidrin SC=Medefaidrin MEDEFAIDRIN FULL STOP + "𛲟", # U+1BC9F BC=L BLK=Duployan SC=Duployan DUPLOYAN PUNCTUATION CHINOOK FULL STOP + "𝪈", # U+1DA88 BC=L BLK=Sutton_SignWriting SC=SignWriting SIGNWRITING FULL STOP + ] + + [ + # Additional manual entries. + "…", # U+2026 HORIZONTAL ELLIPSIS + "。", # U+3002 IDEOGRAPHIC FULL STOP + ] +) diff --git a/test/unit/test_am.py b/test/unit/test_am.py index 4de975e..d116edb 100644 --- a/test/unit/test_am.py +++ b/test/unit/test_am.py @@ -9,7 +9,7 @@ ), ( "ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።", - ["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻ", "ርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።"], + ["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተマ በስተ ምዕራብ ተወለዱ።"], ), ]