diff --git a/sentencex/languages/hy.py b/sentencex/languages/hy.py index 2fc108b..30c5829 100644 --- a/sentencex/languages/hy.py +++ b/sentencex/languages/hy.py @@ -9,5 +9,4 @@ class Armenian(Language): hy_terminators = GLOBAL_SENTENCE_TERMINATORS + ["։", "՜", ":"] hy_terminators.remove(".") - hy_terminators.remove("...") sentence_break_regex = re.compile(r"[%s]+" % "".join(hy_terminators)) diff --git a/sentencex/terminators.py b/sentencex/terminators.py index a54911d..3b0ecae 100644 --- a/sentencex/terminators.py +++ b/sentencex/terminators.py @@ -1,5 +1,6 @@ # unicode code points generated with Unicode::Tussle perl script: # unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", #"' +# Refer: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/SentenceBreakProperty.txt # ruff: noqa: E501 GLOBAL_SENTENCE_TERMINATORS = ( [ @@ -159,7 +160,7 @@ ] + [ # Additional manual entries. - "...", # U+2026 HORIZONTAL ELLIPSIS "。", # U+3002 IDEOGRAPHIC FULL STOP + "｡", # U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP ] )