Skip to content

Commit

Permalink
Regenerate terminators, fix Amharic text
Browse files Browse the repository at this point in the history
Regenerate terminators using Unicode::Tussle

Fixes issue #3
  • Loading branch information
santhoshtr committed Nov 1, 2023
1 parent 5a0a286 commit 0529670
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 143 deletions.
304 changes: 162 additions & 142 deletions sentencex/terminators.py
Original file line number Diff line number Diff line change
@@ -1,143 +1,163 @@
# unicode code points with the \p{Sentence_Break=STerm} or \p{Sentence_Break=ATerm} properties that
# also have the \p{Terminal_Punctuation} property generated with Unicode::Tussle perl script and
# additional fullstops in unicode character sets : https://www.fileformat.info/info/unicode/char/search.htm?q=.&
# preview=entity
# unicode code points generated with Unicode::Tussle perl script:
# unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", #"'
# ruff: noqa: E501
GLOBAL_SENTENCE_TERMINATORS = [
"...", # Horizontal Ellipsis
"!", # Exclamation Mark
".", # Full Stop
"?", # Question Mark
"։", # Armenian Full Stop
"؞", # Arabic Sign Sallallahou Alayhe Wasallam
"؟", # Arabic Question Mark
"۔", # Arabic Full Stop
"܀", # Syriac End of Paragraph
"܁", # Syriac Supralinear Colon
"܂", # Syriac Sublinear Colon
"߹", # Nko Symbol Doorye
"࠷", # Samarkan Letter Do
"࠹", # Samarkan Letter Jho
"࠽", # Samarkan Letter Ro
"࠾", # Samarkan Letter Lo
"।", # Devanagari Danda
"॥", # Devanagari Double Danda
"၊", # Myanmar Sign Myanmar Phrase Stop
"။", # Myanmar Sign Myanmar Paragraph
"።", # Ethiopic Full Stop
"፧", # Ethiopic Colon
"፨", # Ethiopic Preface Colon
"᙮", # Ethiopic Question Mark
"᜵", # Buginese Vowel Sign E
"᜶", # Buginese Vowel Sign O
"᠃", # Mongolian Full Stop
"᠉", # Mongolian Birga
"᥄", # Buhid Virama
"᥅", # Buhid Punctuation Mark
"᪨", # Tai Tham Consonant Sign Medial Ra
"᪩", # Tai Tham Consonant Sign Medial La
"᪪", # Tai Tham Consonant Sign La Taa
"᪫", # Tai Tham Sign Mai Sak
"᭚", # Balinese Pameneng
"᭛", # Balinese Musical Symbol Combining Jublag
"᭞", # Sundanese Padasan Agung
"᭟", # Sundanese Paneken
"᰻", # Buhid Pamudpod
"᰼", # Buhid Pamudpod Han
"᱾", # Limbu Question Mark
"᱿", # Limbu Exclamation Mark
"‼", # Double Exclamation Mark
"‽", # Interrobang
"⁇", # Double Question Mark
"⁈", # Question Exclamation Mark
"⁉", # Exclamation Question Mark
"⸮", # Reversed Question Mark
"⸼", # Armenian Parenthesis Right
"꓿", # Yi Punctuation Small Comma
"꘎", # Vai Comma
"꘏", # Vai Full Stop
"꛳", # Batak Apostrophe
"꛷", # Batak Pangolat
"꡶", # Lanna Punctation Phrase
"꡷", # Lanna Punctation Paragraph
"꣎", # Ol Chiki Punctuation Mucaad
"꣏", # Ol Chiki Punctuation Double
"꤯", # Chakma Sign Visarga
"꧈", # Balinese Musical Symbol Left-Hand Open Dug
"꧉", # Balinese Musical Symbol Right-Hand Open Dug
"꩝", # Cham Consonant Sign Final H
"꩞", # Cham Consonant Sign Glottal Stop
"꩟", # Cham Consonant Sign M
"꫰", # Tai Viet Mai Khit
"꫱", # Tai Viet Vowel Ia
"꯫", # Meetei Mayek Cheikhei
"﹒", # Small Full Stop
"﹖", # Small Question Mark
"﹗", # Small Exclamation Mark
"!", # Fullwidth Exclamation Mark
".", # Fullwidth Full Stop
"?", # Fullwidth Question Mark
"ཕ", # Tibetan Letter Pha
"བ", # Tibetan Letter Ba
"བྷ", # Tibetan Letter Bha
"མ", # Tibetan Letter Ma
"ཙ", # Tibetan Letter Tsa
"၇", # Myanmar Digit Seven
"၈", # Myanmar Digit Eight
"Ⴞ", # Georgian Letter Har
"Ⴟ", # Georgian Letter Hae
"Ⴠ", # Georgian Letter Hoe
"Ⴡ", # Georgian Letter Yu
"ᅁ", # Hangul Letter Yeorin Hieuh
"ᅂ", # Hangul Letter Yeorin Simeum
"ᅃ", # Hangul Letter Yeorin Cieuc
"ᇅ", # Hangul Letter Phieuph-Pieup
"ᇆ", # Hangul Letter Kapyeounphieuph
"ᇍ", # Hangul Letter Kapyeounhieuh
"ᇞ", # Hangul Letter Yang-Hieuh
"ᇟ", # Hangul Letter Yo-Yae
"ሸ", # Ethiopic Syllable Shee
"ሹ", # Ethiopic Syllable Shuu
"ሻ", # Ethiopic Syllable Shaa
"ሼ", # Ethiopic Syllable She
"ኩ", # Ethiopic Syllable Ku
"ᑋ", # Canadian Syllabics We
"ᑌ", # Canadian Syllabics West-Cree Pa
"ᗂ", # Canadian Syllabics South Slavey Lo
"ᗃ", # Canadian Syllabics South Slavey Lu
"ᗉ", # Canadian Syllabics Carrier Syllabic Yay
"ᗊ", # Canadian Syllabics Carrier Syllabic Yaa
"ᗋ", # Canadian Syllabics Carrier Syllabic Ywe
"ᗌ", # Canadian Syllabics Carrier Syllabic Ywi
"ᗍ", # Canadian Syllabics Carrier Syllabic Ywii
"ᗎ", # Canadian Syllabics Carrier Syllabic Ywo
"ᗏ", # Canadian Syllabics Carrier Syllabic Ywoo
"ᗐ", # Canadian Syllabics Carrier Syllabic Ywi
"ᗗ", # Canadian Syllabics Cree-Cha
"ᙁ", # Canadian Syllabics Slavey She
"ᙂ", # Canadian Syllabics Chipewyan Ga
"᥄", # Ethiopic Syllable Gwa
"᥆", # Ethiopic Syllable Gwo
"ᩂ", # Tai Tham Consonant Sign Low Ha
"ᩃ", # Tai Tham Consonant Sign High Ha
"᱁", # Ethiopic Syllable Hoa
"᱂", # Ethiopic Syllable Hoa
"ỷ", # Latin Small Letter Y With Tilde
"Ỹ", # Latin Capital Letter Y With Tilde
"橮", # CJK Unified Ideograph-6AEE
"橯", # CJK Unified Ideograph-6AEF
"櫵", # CJK Unified Ideograph-6AF5
"欷", # CJK Unified Ideograph-6B37
"欸", # CJK Unified Ideograph-6B38
"歄", # CJK Unified Ideograph-6B84
"溘", # CJK Unified Ideograph-6E98
"벟", # Hangul Syllable Eq
"⳹", # Greek Small Letter Ous
"⳾", # Greek Small Letter Psi
"。", # Ideographic Full Stop
"︒", # Presentation Form For Vertical Ideographic Full Stop
"。", # Halfwidth Katakana Middle Dot
"𖫵", # Mongolian Vowel Separator
"𖺘", # Mongolian Letter Ali Gali U
"𛲟", # Hanifi Rohingya Sign Harbahay
"𝪈", # Mathematical Bold Capital U
] # 150 symbols
"!", # U+00021 BC=ON BLK=Basic_Latin SC=Common EXCLAMATION MARK
".", # U+0002E BC=CS BLK=Basic_Latin SC=Common FULL STOP
"?", # U+0003F BC=ON BLK=Basic_Latin SC=Common QUESTION MARK
"։", # U+00589 BC=L BLK=Armenian SC=Armenian ARMENIAN FULL STOP
"؝", # U+0061D BC=AL BLK=Arabic SC=Arabic ARABIC END OF TEXT MARK
"؞", # U+0061E BC=AL BLK=Arabic SC=Arabic ARABIC TRIPLE DOT PUNCTUATION MARK
"؟", # U+0061F BC=AL BLK=Arabic SC=Common ARABIC QUESTION MARK
"۔", # U+006D4 BC=AL BLK=Arabic SC=Arabic ARABIC FULL STOP
"܀", # U+00700 BC=AL BLK=Syriac SC=Syriac SYRIAC END OF PARAGRAPH
"܁", # U+00701 BC=AL BLK=Syriac SC=Syriac SYRIAC SUPRALINEAR FULL STOP
"܂", # U+00702 BC=AL BLK=Syriac SC=Syriac SYRIAC SUBLINEAR FULL STOP
"߹", # U+007F9 BC=ON BLK=NKo SC=Nko NKO EXCLAMATION MARK
"࠷", # U+00837 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION MELODIC QITSA
"࠹", # U+00839 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION QITSA
"࠽", # U+0083D BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION SOF MASHFAAT
"࠾", # U+0083E BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION ANNAAU
"।", # U+00964 BC=L BLK=Devanagari SC=Common DEVANAGARI DANDA
"॥", # U+00965 BC=L BLK=Devanagari SC=Common DEVANAGARI DOUBLE DANDA
"၊", # U+0104A BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN LITTLE SECTION
"။", # U+0104B BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN SECTION
"።", # U+01362 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC FULL STOP
"፧", # U+01367 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC QUESTION MARK
"፨", # U+01368 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC PARAGRAPH SEPARATOR
"᙮", # U+0166E BC=L BLK=Unified_Canadian_Aboriginal_Syllabics SC=Canadian_Aboriginal CANADIAN SYLLABICS FULL STOP
"᜵", # U+01735 BC=L BLK=Hanunoo SC=Common PHILIPPINE SINGLE PUNCTUATION
"᜶", # U+01736 BC=L BLK=Hanunoo SC=Common PHILIPPINE DOUBLE PUNCTUATION
"᠃", # U+01803 BC=ON BLK=Mongolian SC=Common MONGOLIAN FULL STOP
"᠉", # U+01809 BC=ON BLK=Mongolian SC=Mongolian MONGOLIAN MANCHU FULL STOP
"᥄", # U+01944 BC=ON BLK=Limbu SC=Limbu LIMBU EXCLAMATION MARK
"᥅", # U+01945 BC=ON BLK=Limbu SC=Limbu LIMBU QUESTION MARK
"᪨", # U+01AA8 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAAN
"᪩", # U+01AA9 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAANKUU
"᪪", # U+01AAA BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAAN
"᪫", # U+01AAB BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAANKUU
"᭚", # U+01B5A BC=L BLK=Balinese SC=Balinese BALINESE PANTI
"᭛", # U+01B5B BC=L BLK=Balinese SC=Balinese BALINESE PAMADA
"᭞", # U+01B5E BC=L BLK=Balinese SC=Balinese BALINESE CARIK SIKI
"᭟", # U+01B5F BC=L BLK=Balinese SC=Balinese BALINESE CARIK PAREREN
"᭽", # U+01B7D BC=L BLK=Balinese SC=Balinese BALINESE PANTI LANTANG
"᭾", # U+01B7E BC=L BLK=Balinese SC=Balinese BALINESE PAMADA LANTANG
"᰻", # U+01C3B BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION TA-ROL
"᰼", # U+01C3C BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION NYET THYOOM TA-ROL
"᱾", # U+01C7E BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION MUCAAD
"᱿", # U+01C7F BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION DOUBLE MUCAAD
"․", # U+02024 BC=ON BLK=General_Punctuation SC=Common ONE DOT LEADER
"‼", # U+0203C BC=ON BLK=General_Punctuation SC=Common DOUBLE EXCLAMATION MARK
"‽", # U+0203D BC=ON BLK=General_Punctuation SC=Common INTERROBANG
"⁇", # U+02047 BC=ON BLK=General_Punctuation SC=Common DOUBLE QUESTION MARK
"⁈", # U+02048 BC=ON BLK=General_Punctuation SC=Common QUESTION EXCLAMATION MARK
"⁉", # U+02049 BC=ON BLK=General_Punctuation SC=Common EXCLAMATION QUESTION MARK
"⸮", # U+02E2E BC=ON BLK=Supplemental_Punctuation SC=Common REVERSED QUESTION MARK
"⸼", # U+02E3C BC=ON BLK=Supplemental_Punctuation SC=Common STENOGRAPHIC FULL STOP
"⹓", # U+02E53 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL EXCLAMATION MARK
"⹔", # U+02E54 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL QUESTION MARK
"꓿", # U+0A4FF BC=L BLK=Lisu SC=Lisu LISU PUNCTUATION FULL STOP
"꘎", # U+0A60E BC=ON BLK=Vai SC=Vai VAI FULL STOP
"꘏", # U+0A60F BC=ON BLK=Vai SC=Vai VAI QUESTION MARK
"꛳", # U+0A6F3 BC=L BLK=Bamum SC=Bamum BAMUM FULL STOP
"꛷", # U+0A6F7 BC=L BLK=Bamum SC=Bamum BAMUM QUESTION MARK
"꡶", # U+0A876 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK SHAD
"꡷", # U+0A877 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK DOUBLE SHAD
"꣎", # U+0A8CE BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DANDA
"꣏", # U+0A8CF BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DOUBLE DANDA
"꤯", # U+0A92F BC=L BLK=Kayah_Li SC=Kayah_Li KAYAH LI SIGN SHYA
"꧈", # U+0A9C8 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LINGSA
"꧉", # U+0A9C9 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LUNGSI
"꩝", # U+0AA5D BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DANDA
"꩞", # U+0AA5E BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DOUBLE DANDA
"꩟", # U+0AA5F BC=L BLK=Cham SC=Cham CHAM PUNCTUATION TRIPLE DANDA
"꫰", # U+0AAF0 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK CHEIKHAN
"꫱", # U+0AAF1 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK AHANG KHUDAM
"꯫", # U+0ABEB BC=L BLK=Meetei_Mayek SC=Meetei_Mayek MEETEI MAYEK CHEIKHEI
"﹒", # U+0FE52 BC=CS BLK=Small_Form_Variants SC=Common SMALL FULL STOP
"﹖", # U+0FE56 BC=ON BLK=Small_Form_Variants SC=Common SMALL QUESTION MARK
"﹗", # U+0FE57 BC=ON BLK=Small_Form_Variants SC=Common SMALL EXCLAMATION MARK
"!", # U+0FF01 BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH EXCLAMATION MARK
".", # U+0FF0E BC=CS BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH FULL STOP
"?", # U+0FF1F BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH QUESTION MARK
"𐩖", # U+10A56 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DANDA
"𐩗", # U+10A57 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DOUBLE DANDA
"𐽕", # U+10F55 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS
"𐽖", # U+10F56 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS
"𐽗", # U+10F57 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION CIRCLE WITH DOT
"𐽘", # U+10F58 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS
"𐽙", # U+10F59 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
"𐾆", # U+10F86 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION BAR
"𐾇", # U+10F87 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO BARS
"𐾈", # U+10F88 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO DOTS
"𐾉", # U+10F89 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION FOUR DOTS
"𑁇", # U+11047 BC=L BLK=Brahmi SC=Brahmi BRAHMI DANDA
"𑁈", # U+11048 BC=L BLK=Brahmi SC=Brahmi BRAHMI DOUBLE DANDA
"𑂾", # U+110BE BC=L BLK=Kaithi SC=Kaithi KAITHI SECTION MARK
"𑂿", # U+110BF BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE SECTION MARK
"𑃀", # U+110C0 BC=L BLK=Kaithi SC=Kaithi KAITHI DANDA
"𑃁", # U+110C1 BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE DANDA
"𑅁", # U+11141 BC=L BLK=Chakma SC=Chakma CHAKMA DANDA
"𑅂", # U+11142 BC=L BLK=Chakma SC=Chakma CHAKMA DOUBLE DANDA
"𑅃", # U+11143 BC=L BLK=Chakma SC=Chakma CHAKMA QUESTION MARK
"𑇅", # U+111C5 BC=L BLK=Sharada SC=Sharada SHARADA DANDA
"𑇆", # U+111C6 BC=L BLK=Sharada SC=Sharada SHARADA DOUBLE DANDA
"𑇍", # U+111CD BC=L BLK=Sharada SC=Sharada SHARADA SUTRA MARK
"𑇞", # U+111DE BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-1
"𑇟", # U+111DF BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-2
"𑈸", # U+11238 BC=L BLK=Khojki SC=Khojki KHOJKI DANDA
"𑈹", # U+11239 BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE DANDA
"𑈻", # U+1123B BC=L BLK=Khojki SC=Khojki KHOJKI SECTION MARK
"𑈼", # U+1123C BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE SECTION MARK
"𑊩", # U+112A9 BC=L BLK=Multani SC=Multani MULTANI SECTION MARK
"𑑋", # U+1144B BC=L BLK=Newa SC=Newa NEWA DANDA
"𑑌", # U+1144C BC=L BLK=Newa SC=Newa NEWA DOUBLE DANDA
"𑗂", # U+115C2 BC=L BLK=Siddham SC=Siddham SIDDHAM DANDA
"𑗃", # U+115C3 BC=L BLK=Siddham SC=Siddham SIDDHAM DOUBLE DANDA
"𑗉", # U+115C9 BC=L BLK=Siddham SC=Siddham SIDDHAM END OF TEXT MARK
"𑗊", # U+115CA BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS
"𑗋", # U+115CB BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS
"𑗌", # U+115CC BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS
"𑗍", # U+115CD BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS
"𑗎", # U+115CE BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS
"𑗏", # U+115CF BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING
"𑗐", # U+115D0 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING WITH RAYS
"𑗑", # U+115D1 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS
"𑗒", # U+115D2 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS
"𑗓", # U+115D3 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS
"𑗔", # U+115D4 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS
"𑗕", # U+115D5 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND RAYS
"𑗖", # U+115D6 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES
"𑗗", # U+115D7 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
"𑙁", # U+11641 BC=L BLK=Modi SC=Modi MODI DANDA
"𑙂", # U+11642 BC=L BLK=Modi SC=Modi MODI DOUBLE DANDA
"𑜼", # U+1173C BC=L BLK=Ahom SC=Ahom AHOM SIGN SMALL SECTION
"𑜽", # U+1173D BC=L BLK=Ahom SC=Ahom AHOM SIGN SECTION
"𑜾", # U+1173E BC=L BLK=Ahom SC=Ahom AHOM SIGN RULAI
"𑥄", # U+11944 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU DOUBLE DANDA
"𑥆", # U+11946 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU END OF TEXT MARK
"𑩂", # U+11A42 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK SHAD
"𑩃", # U+11A43 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK DOUBLE SHAD
"𑪛", # U+11A9B BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK SHAD
"𑪜", # U+11A9C BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK DOUBLE SHAD
"𑱁", # U+11C41 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DANDA
"𑱂", # U+11C42 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DOUBLE DANDA
"𑻷", # U+11EF7 BC=L BLK=Makasar SC=Makasar MAKASAR PASSIMBANG
"𑻸", # U+11EF8 BC=L BLK=Makasar SC=Makasar MAKASAR END OF SECTION
"𑽃", # U+11F43 BC=L BLK=Kawi SC=Kawi KAWI DANDA
"𑽄", # U+11F44 BC=L BLK=Kawi SC=Kawi KAWI DOUBLE DANDA
"𖩮", # U+16A6E BC=L BLK=Mro SC=Mro MRO DANDA
"𖩯", # U+16A6F BC=L BLK=Mro SC=Mro MRO DOUBLE DANDA
"𖫵", # U+16AF5 BC=L BLK=Bassa_Vah SC=Bassa_Vah BASSA VAH FULL STOP
"𖬷", # U+16B37 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS THOM
"𖬸", # U+16B38 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS TSHAB CEEB
"𖭄", # U+16B44 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN XAUS
"𖺘", # U+16E98 BC=L BLK=Medefaidrin SC=Medefaidrin MEDEFAIDRIN FULL STOP
"𛲟", # U+1BC9F BC=L BLK=Duployan SC=Duployan DUPLOYAN PUNCTUATION CHINOOK FULL STOP
"𝪈", # U+1DA88 BC=L BLK=Sutton_SignWriting SC=SignWriting SIGNWRITING FULL STOP
] + [
# Additional manual entries.
"...", # U+2026 HORIZONTAL ELLIPSIS
"。", # U+3002 IDEOGRAPHIC FULL STOP
]

2 changes: 1 addition & 1 deletion test/unit/test_am.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
),
(
"ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።",
["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻ", "ርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።"],
["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።"],
),
]

Expand Down

0 comments on commit 0529670

Please sign in to comment.