Skip to content

Commit

Permalink
Merge pull request #713 from PrimozGodec/lemmagen-picklable
Browse files Browse the repository at this point in the history
Make Lemmagen lemmatizer picklable
  • Loading branch information
ajdapretnar authored Sep 13, 2021
2 parents bb2337a + b2000f2 commit 947c352
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 4 deletions.
11 changes: 10 additions & 1 deletion orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,18 @@ class LemmagenLemmatizer(BaseNormalizer):

def __init__(self, language='English'):
    """Remember the requested language without building the lemmatizer.

    Args:
        language: key into ``self.lemmagen_languages`` selecting the
            lemmagen model; it is looked up when the normalizer is called.
    """
    super().__init__()
    self.language = language
    # The lemmagen3 Lemmatizer is not picklable, so it is not created here;
    # __call__ builds it on demand and discards it afterwards.
    self.lemmatizer = None

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """Run the normalizer on ``corpus`` with a temporary lemmatizer.

    Args:
        corpus: corpus passed through to the parent ``__call__``.
        callback: optional callback passed through to the parent
            ``__call__``.

    Returns:
        Whatever the parent ``__call__`` returns for ``corpus``.
    """
    # lemmagen3 lemmatizer is not picklable, define it on call and discard
    # it afterward
    self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
    try:
        return super().__call__(corpus, callback)
    finally:
        # Reset even when the parent call raises, so a failed run does not
        # leave the unpicklable lemmatizer attached to this instance.
        self.lemmatizer = None

def normalizer(self, token):
assert self.lemmatizer is not None
t = self.lemmatizer.lemmatize(token)
# sometimes Lemmagen returns an empty string, return original tokens
# in this case
Expand Down
15 changes: 12 additions & 3 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,12 +302,21 @@ def test_udpipe_deepcopy(self):

def test_lemmagen(self):
    """Lemmagen normalizer output must match lemmagen3 applied per token."""
    normalizer = preprocess.LemmagenLemmatizer('Slovenian')
    sentence = 'Gori na gori hiša gori'
    # NOTE(review): presumably metas[0, 0] holds the text of the first
    # document, so tokens[0] below are its tokens; confirm against setUp.
    self.corpus.metas[0, 0] = sentence
    # Build the reference lemmatizer once instead of once per token.
    reference = Lemmatizer("sl")
    self.assertEqual(
        [reference.lemmatize(t) for t in sentence.split()],
        normalizer(self.corpus).tokens[0],
    )

def test_normalizers_picklable(self):
    """Every exported normalizer must survive a pickle round trip."""
    # sorted() makes the iteration order deterministic; BaseNormalizer is
    # excluded from the check.
    names = sorted(set(preprocess.normalize.__all__) - {"BaseNormalizer"})
    for nm in names:
        # subTest reports which normalizer failed and keeps checking the rest.
        with self.subTest(normalizer=nm):
            normalizer = getattr(preprocess.normalize, nm)()
            # Call once before pickling so that any state a call leaves
            # behind is part of what gets pickled.
            normalizer(self.corpus)
            loaded = pickle.loads(pickle.dumps(normalizer))
            # The restored object must still be usable.
            loaded(self.corpus)

def test_cache(self):
normalizer = preprocess.UDPipeLemmatizer('Slovenian')
self.corpus.metas[0, 0] = 'sem'
Expand Down

0 comments on commit 947c352

Please sign in to comment.