From 9ff7526c6cf08f3dea7a4b23a2281d871db4a4ab Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Tue, 14 Nov 2023 12:13:55 +0530 Subject: [PATCH] terminators: remove ellipses terminator, add halfwidth ideographic fullstop Fixes issue #11 --- sentencex/languages/hy.py | 1 - sentencex/terminators.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sentencex/languages/hy.py b/sentencex/languages/hy.py index 2fc108b..30c5829 100644 --- a/sentencex/languages/hy.py +++ b/sentencex/languages/hy.py @@ -9,5 +9,4 @@ class Armenian(Language): hy_terminators = GLOBAL_SENTENCE_TERMINATORS + ["։", "՜", ":"] hy_terminators.remove(".") - hy_terminators.remove("…") sentence_break_regex = re.compile(r"[%s]+" % "".join(hy_terminators)) diff --git a/sentencex/terminators.py b/sentencex/terminators.py index a54911d..3b0ecae 100644 --- a/sentencex/terminators.py +++ b/sentencex/terminators.py @@ -1,5 +1,6 @@ # unicode code points generated with Unicode::Tussle perl script: # unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", #"' +# Refer: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/SentenceBreakProperty.txt # ruff: noqa: E501 GLOBAL_SENTENCE_TERMINATORS = ( [ @@ -159,7 +160,7 @@ ] + [ # Additional manual entries. - "…", # U+2026 HORIZONTAL ELLIPSIS "。", # U+3002 IDEOGRAPHIC FULL STOP + "｡", # U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP ] )