-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Based on the Slovak language that is very close to Czech one I have created initial support for Czech language sentence splitting
- Loading branch information
Michal Bida
committed
Sep 30, 2024
1 parent
5905f13
commit aa9805e
Showing
4 changed files
with
145 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# -*- coding: utf-8 -*- | ||
import re | ||
from pysbd.abbreviation_replacer import AbbreviationReplacer | ||
from pysbd.between_punctuation import BetweenPunctuation | ||
from pysbd.lang.common import Common, Standard | ||
from pysbd.processor import Processor | ||
from pysbd.utils import Text | ||
from pysbd.punctuation_replacer import replace_punctuation | ||
from pysbd.lists_item_replacer import ListItemReplacer | ||
|
||
|
||
class Czech(Common, Standard): | ||
""" | ||
Based on slovak.py language as the languages are very close. | ||
""" | ||
|
||
iso_code = 'cz' | ||
|
||
class ListItemReplacer(ListItemReplacer): | ||
|
||
def add_line_break(self): | ||
# We've found alphabetical lists are causing a lot of problems with abbreviations | ||
# with multiple periods and spaces, such as 'Company name s. r. o.'. Disabling | ||
# alphabetical list parsing seems like a reasonable tradeoff. | ||
|
||
# self.format_alphabetical_lists() | ||
self.format_roman_numeral_lists() | ||
self.format_numbered_list_with_periods() | ||
self.format_numbered_list_with_parens() | ||
return self.text | ||
|
||
class AbbreviationReplacer(AbbreviationReplacer): | ||
SENTENCE_STARTERS = [] | ||
|
||
def replace_period_of_abbr(self, txt, abbr): | ||
# This is a very simple version of the original function, which makes sure | ||
# all of the periods in the abbreviation get replaced, not only the last one. | ||
# In Czech language we use a lot of abbreviations like 'Company Name s. r. o.', so it | ||
# is important to handle this properly. | ||
|
||
abbr_new = abbr.replace(".", "∯") + "∯" | ||
txt = txt.replace(abbr + ".", abbr_new) | ||
return txt | ||
|
||
class Abbreviation(Standard.Abbreviation): | ||
ABBREVIATIONS = ['č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p', 'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl', 'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a', 'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc', 'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'napřk', 'a.g.p', 'a. g. p', 'prof', 'pr', 'př', 'a.v', 'a. v', 'por', 'mvdr', 'nešp', 'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i', 'o. f. i', 'mr', 'apod', 'súkr', 'střed', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind', 'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i', 'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z', 'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p', 'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp', 'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str', 'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s', 'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o', 'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol', 'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn', 'příp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld', 'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod', 'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. j', 'spol', 'mil', 'ii', 'napr', 'resp', 'tzv'] | ||
PREPOSITIVE_ABBREVIATIONS = ['st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof'] | ||
NUMBER_ABBREVIATIONS = ['č', 'no', 'nr'] | ||
|
||
class BetweenPunctuation(BetweenPunctuation): | ||
# Rubular: https://rubular.com/r/rImWbaYFtHHtf4 | ||
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“' | ||
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“' | ||
|
||
def sub_punctuation_between_czech_double_quotes(self, txt): | ||
return re.sub(self.BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2, replace_punctuation, txt) | ||
|
||
def sub_punctuation_between_quotes_and_parens(self, txt): | ||
txt = self.sub_punctuation_between_single_quotes(txt) | ||
txt = self.sub_punctuation_between_single_quote_slanted(txt) | ||
txt = self.sub_punctuation_between_double_quotes(txt) | ||
txt = self.sub_punctuation_between_square_brackets(txt) | ||
txt = self.sub_punctuation_between_parens(txt) | ||
txt = self.sub_punctuation_between_quotes_arrow(txt) | ||
txt = self.sub_punctuation_between_em_dashes(txt) | ||
txt = self.sub_punctuation_between_quotes_slanted(txt) | ||
txt = self.sub_punctuation_between_czech_double_quotes(txt) | ||
return txt | ||
|
||
class Processor(Processor): | ||
|
||
def __init__(self, text, lang, char_span=False): | ||
super().__init__(text, lang, char_span) | ||
|
||
def process(self): | ||
if not self.text: | ||
return self.text | ||
self.text = self.text.replace('\n', '\r') | ||
|
||
# Here we use language specific ListItemReplacer: | ||
li = self.lang.ListItemReplacer(self.text) | ||
self.text = li.add_line_break() | ||
|
||
self.replace_abbreviations() | ||
self.replace_numbers() | ||
self.replace_continuous_punctuation() | ||
self.replace_periods_before_numeric_references() | ||
self.text = Text(self.text).apply( | ||
self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule, | ||
self.lang.GeoLocationRule, self.lang.FileFormatRule) | ||
postprocessed_sents = self.split_into_segments() | ||
return postprocessed_sents | ||
|
||
def replace_numbers(self): | ||
self.text = Text(self.text).apply(*self.lang.Numbers.All) | ||
self.replace_period_in_czech_dates() | ||
self.replace_period_in_ordinal_numerals() | ||
self.replace_period_in_roman_numerals() | ||
return self.text | ||
|
||
def replace_period_in_ordinal_numerals(self): | ||
# Rubular: https://rubular.com/r/0HkmvzMGTqgWs6 | ||
self.text = re.sub(r'(?<=\d)\.(?=\s*[a-z]+)', '∯', self.text) | ||
|
||
def replace_period_in_roman_numerals(self): | ||
# Rubular: https://rubular.com/r/XlzTIi7aBRThSl | ||
self.text = re.sub(r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)', r'\1∯', self.text, re.IGNORECASE) | ||
|
||
def replace_period_in_czech_dates(self): | ||
MONTHS = ['leden', 'únor', 'březen', 'duben', 'květen', 'červen', 'červenec', 'srpen', 'září', 'říjen', 'listopad', 'prosinec', | ||
'ledna', 'února', 'března', 'dubna', 'května', 'června', 'července', 'srpna', 'října', 'listopadu', 'prosince'] | ||
for month in MONTHS: | ||
# Rubular: https://rubular.com/r/dGLZqsbjcdJvCd | ||
self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# -*- coding: utf-8 -*- | ||
import pytest | ||
|
||
GOLDEN_CZ_RULES_TEST_CASES = [ | ||
("Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s.", | ||
["Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s."]), | ||
("„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus.", | ||
["„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus."]), | ||
("Toto se mi podařilo až na 10. pokus, ale stálo to za to.", | ||
["Toto se mi podařilo až na 10. pokus, ale stálo to za to."]), | ||
("Jde o príslušníky XII. Pluku speciálního nasazení.", | ||
["Jde o príslušníky XII. Pluku speciálního nasazení."]), | ||
("Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020.", | ||
["Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020."]), | ||
] | ||
|
||
|
||
@pytest.mark.parametrize('text,expected_sents', GOLDEN_CZ_RULES_TEST_CASES) | ||
def test_pl_sbd(cz_default_fixture, text, expected_sents): | ||
"""Czech language SBD tests""" | ||
segments = cz_default_fixture.segment(text) | ||
segments = [s.strip() for s in segments] | ||
assert segments == expected_sents |