From b0cfc4d17c80df6fbbe7a01fb12884371bf33a6f Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 13:24:40 +0200 Subject: [PATCH 1/4] Async NLTK data download --- orangecontrib/text/misc/__init__.py | 2 +- orangecontrib/text/misc/environ.py | 11 ---- orangecontrib/text/misc/nltk_data_download.py | 56 +++++++++++++++++++ 3 files changed, 57 insertions(+), 12 deletions(-) delete mode 100644 orangecontrib/text/misc/environ.py create mode 100644 orangecontrib/text/misc/nltk_data_download.py diff --git a/orangecontrib/text/misc/__init__.py b/orangecontrib/text/misc/__init__.py index df18ab27b..8a4692d27 100644 --- a/orangecontrib/text/misc/__init__.py +++ b/orangecontrib/text/misc/__init__.py @@ -1 +1 @@ -from .environ import * +from .nltk_data_download import * diff --git a/orangecontrib/text/misc/environ.py b/orangecontrib/text/misc/environ.py deleted file mode 100644 index f815186cd..000000000 --- a/orangecontrib/text/misc/environ.py +++ /dev/null @@ -1,11 +0,0 @@ -import os - -from Orange.misc.environ import data_dir_base - - -def nltk_data_dir(): - """ Location where the NLTK data is stored. """ - dir_ = os.path.join(data_dir_base(), 'Orange', 'nltk_data') - # make sure folder exists for ReadTheDocs - os.makedirs(dir_, exist_ok=True) - return dir_ diff --git a/orangecontrib/text/misc/nltk_data_download.py b/orangecontrib/text/misc/nltk_data_download.py new file mode 100644 index 000000000..735fb2dfe --- /dev/null +++ b/orangecontrib/text/misc/nltk_data_download.py @@ -0,0 +1,56 @@ +import os +import sys +import time +from functools import wraps +from threading import Thread + +import nltk +from Orange.misc.environ import data_dir_base + +__all__ = ['wait_nltk_data', 'nltk_data_dir'] + +NLTK_DATA = [ + 'wordnet', + 'stopwords', + 'punkt', + 'opinion_lexicon', + 'vader_lexicon', + 'averaged_perceptron_tagger', + 'maxent_treebank_pos_tagger', +] + + +def nltk_data_dir(): + """ Location where the NLTK data is stored. """ + dir_ = os.path.join(data_dir_base(), 'Orange', 'nltk_data') + # make sure folder exists for ReadTheDocs + os.makedirs(dir_, exist_ok=True) + return dir_ + + +is_done_loading = False + + +def _download_nltk_data(): + global is_done_loading + nltk.download(NLTK_DATA, download_dir=nltk_data_dir()) + is_done_loading = True + sys.stdout.flush() + + +Thread(target=_download_nltk_data).start() + + +def wait_nltk_data(func): + """ Decorator that waits until all NLTK data is downloaded. """ + dir_ = nltk_data_dir() + if dir_ not in nltk.data.path: # assure NLTK knows where the data is + nltk.data.path.append(dir_) + + @wraps(func) + def wrapper(*args, **kwargs): + global is_done_loading + while not is_done_loading: + time.sleep(.1) + return func(*args, **kwargs) + return wrapper From f0294789336501e2a07d63db4ecf3b1ec0ed00ed Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 13:27:40 +0200 Subject: [PATCH 2/4] Sentiment: Adopt to async NLTK data download --- orangecontrib/text/sentiment/__init__.py | 10 +++++----- orangecontrib/text/widgets/owsentimentanalysis.py | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/orangecontrib/text/sentiment/__init__.py b/orangecontrib/text/sentiment/__init__.py index c6234f1a8..874d0bfa3 100644 --- a/orangecontrib/text/sentiment/__init__.py +++ b/orangecontrib/text/sentiment/__init__.py @@ -1,24 +1,23 @@ -import nltk import numpy as np from nltk.corpus import opinion_lexicon from nltk.sentiment import SentimentIntensityAnalyzer from orangecontrib.text import Corpus +from orangecontrib.text.misc import wait_nltk_data from orangecontrib.text.preprocess import WordPunctTokenizer from orangecontrib.text.vectorization.base import SharedTransform, \ VectorizationComputeValue -nltk.download(["opinion_lexicon", "vader_lexicon"]) - class Liu_Hu_Sentiment: - positive = set(opinion_lexicon.positive()) - negative = set(opinion_lexicon.negative()) sentiments = ('sentiment',) name = 'Liu Hu' + @wait_nltk_data def __init__(self): super().__init__() + self.positive = set(opinion_lexicon.positive()) + self.negative = set(opinion_lexicon.negative()) def transform(self, corpus, copy=True): scores = [] @@ -46,6 +45,7 @@ class Vader_Sentiment: sentiments = ('pos', 'neg', 'neu', 'compound') name = 'Vader' + @wait_nltk_data def __init__(self): super().__init__() self.vader = SentimentIntensityAnalyzer() diff --git a/orangecontrib/text/widgets/owsentimentanalysis.py b/orangecontrib/text/widgets/owsentimentanalysis.py index 5bd426c1a..48635d22d 100644 --- a/orangecontrib/text/widgets/owsentimentanalysis.py +++ b/orangecontrib/text/widgets/owsentimentanalysis.py @@ -7,10 +7,6 @@ from orangecontrib.text.sentiment import Vader_Sentiment, Liu_Hu_Sentiment -METHODS = [Liu_Hu_Sentiment(), - Vader_Sentiment()] - - class OWSentimentAnalysis(OWWidget): name = "Sentiment Analysis" description = "Predict sentiment from text." @@ -30,10 +26,14 @@ class Outputs: def __init__(self): super().__init__() + self.METHODS = [ + Liu_Hu_Sentiment(), + Vader_Sentiment() + ] self.corpus = None gui.radioButtons(self.controlArea, self, "method_idx", box="Method", - btnLabels=[m.name for m in METHODS], + btnLabels=[m.name for m in self.METHODS], callback=self._method_changed) ac = gui.auto_commit(self.controlArea, self, 'autocommit', 'Commit', @@ -51,7 +51,7 @@ def _method_changed(self): def commit(self): if self.corpus is not None: - method = METHODS[self.method_idx] + method = self.METHODS[self.method_idx] out = method.transform(self.corpus) self.Outputs.corpus.send(out) else: @@ -59,7 +59,7 @@ def commit(self): def send_report(self): self.report_items(( - ('Method', METHODS[self.method_idx].name), + ('Method', self.METHODS[self.method_idx].name), )) From cc978e88658d6b66f80b700ab84d581d24667170 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 13:29:13 +0200 Subject: [PATCH 3/4] Preprocess: Adopt to async NLTK data download --- orangecontrib/text/preprocess/__init__.py | 4 ---- orangecontrib/text/preprocess/filter.py | 3 +++ orangecontrib/text/preprocess/normalize.py | 6 ++++++ orangecontrib/text/preprocess/preprocess.py | 1 + orangecontrib/text/preprocess/tokenize.py | 5 +++++ 5 files changed, 15 insertions(+), 4 deletions(-) diff --git a/orangecontrib/text/preprocess/__init__.py b/orangecontrib/text/preprocess/__init__.py index 9e26164d4..5ed567e67 100644 --- a/orangecontrib/text/preprocess/__init__.py +++ b/orangecontrib/text/preprocess/__init__.py @@ -30,10 +30,6 @@ ['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications'] """ -import nltk - -nltk.download(["wordnet", "stopwords", "punkt"]) - from .filter import * from .normalize import * from .tokenize import * diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index 7d7d101dd..e5fe85e13 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -5,6 +5,8 @@ from gensim import corpora from nltk.corpus import stopwords +from orangecontrib.text.misc import wait_nltk_data + __all__ = ['BaseTokenFilter', 'StopwordsFilter', 'LexiconFilter', 'RegexpFilter', 'FrequencyFilter'] @@ -65,6 +67,7 @@ class StopwordsFilter(BaseTokenFilter, WordListMixin): supported_languages = [file.capitalize() for file in stopwords_listdir] + @wait_nltk_data def __init__(self, language='English', word_list=None): WordListMixin.__init__(self, word_list) super().__init__() diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 5c6e8b61a..8401b4ddd 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -1,5 +1,7 @@ from nltk import stem +from orangecontrib.text.misc import wait_nltk_data + __all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer', 'SnowballStemmer', 'DictionaryLookupNormalizer'] @@ -35,6 +37,10 @@ class WordNetLemmatizer(BaseNormalizer): name = 'WordNet Lemmatizer' normalizer = stem.WordNetLemmatizer().lemmatize + @wait_nltk_data + def __init__(self): + super().__init__() + class DictionaryLookupNormalizer(BaseNormalizer): """ Normalizes token with a dictionary. """ diff --git a/orangecontrib/text/preprocess/preprocess.py b/orangecontrib/text/preprocess/preprocess.py index 59233eecb..4a144663e 100644 --- a/orangecontrib/text/preprocess/preprocess.py +++ b/orangecontrib/text/preprocess/preprocess.py @@ -5,6 +5,7 @@ __all__ = ['Preprocessor', 'base_preprocessor'] +# don't use anything that requires NLTK data to assure async download BASE_TOKENIZER = WordPunctTokenizer() BASE_TRANSFORMERS = [LowercaseTransformer()] diff --git a/orangecontrib/text/preprocess/tokenize.py b/orangecontrib/text/preprocess/tokenize.py index 19e014539..e6f1dd769 100644 --- a/orangecontrib/text/preprocess/tokenize.py +++ b/orangecontrib/text/preprocess/tokenize.py @@ -1,6 +1,7 @@ import re from nltk import tokenize +from orangecontrib.text.misc import wait_nltk_data __all__ = ['BaseTokenizer', 'WordPunctTokenizer', 'PunktSentenceTokenizer', 'RegexpTokenizer', 'WhitespaceTokenizer', 'TweetTokenizer'] @@ -48,6 +49,10 @@ class PunktSentenceTokenizer(BaseTokenizer): tokenizer = tokenize.PunktSentenceTokenizer() name = 'Sentence' + @wait_nltk_data + def __init__(self): + super().__init__() + class WhitespaceTokenizer(BaseTokenizer): """ Split only by whitespace. """ From f95c48b93ddaf6b489725ad7f648ce3e95f253c5 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 14:25:05 +0200 Subject: [PATCH 4/4] POS: Adopt to async NLTK data download --- orangecontrib/text/tag/__init__.py | 10 ++++----- orangecontrib/text/tag/pos.py | 24 ++++++++++++++++------ orangecontrib/text/tests/test_corpus.py | 13 +++++++----- orangecontrib/text/tests/test_tags.py | 4 ++-- orangecontrib/text/widgets/owpreprocess.py | 14 +++++++++---- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/orangecontrib/text/tag/__init__.py b/orangecontrib/text/tag/__init__.py index 12f4c854e..ffb43ab5e 100644 --- a/orangecontrib/text/tag/__init__.py +++ b/orangecontrib/text/tag/__init__.py @@ -5,17 +5,15 @@ This module provides a default `pos_tagger` that can be used for POSTagging an English corpus:: >>> from orangecontrib.text.corpus import Corpus - >>> from orangecontrib.text.tag import pos_tagger + >>> from orangecontrib.text.tag import AveragedPerceptronTagger >>> corpus = Corpus.from_file('deerwester.tab') - >>> tagged_corpus = pos_tagger.tag_corpus(corpus) + >>> tagger = AveragedPerceptronTagger() + >>> tagged_corpus = tagger.tag_corpus(corpus) >>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly ['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS'] >>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents ['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS'] - """ -from .pos import POSTagger, StanfordPOSTagger, taggers - -pos_tagger = taggers[0] +from .pos import * diff --git a/orangecontrib/text/tag/pos.py b/orangecontrib/text/tag/pos.py index e126783ee..ff39e2343 100644 --- a/orangecontrib/text/tag/pos.py +++ b/orangecontrib/text/tag/pos.py @@ -2,8 +2,10 @@ import numpy as np from orangecontrib.text.util import chunkable +from orangecontrib.text.misc import wait_nltk_data -nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger']) + +__all__ = ['POSTagger', 'StanfordPOSTagger', 'AveragedPerceptronTagger', 'MaxEntTagger'] class POSTagger: @@ -62,8 +64,18 @@ def __str__(self): return "{} (model: {})".format(self.name, self._stanford_model) -taggers = [ - POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'), - POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'), - 'Treebank POS Tagger (MaxEnt)'), -] +class AveragedPerceptronTagger(POSTagger): + name = 'Averaged Perceptron Tagger' + + @wait_nltk_data + def __init__(self): + super().__init__(nltk.PerceptronTagger(), self.name) + + +class MaxEntTagger(POSTagger): + name = 'Treebank POS Tagger (MaxEnt)' + + @wait_nltk_data + def __init__(self): + tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle') + super().__init__(tagger, self.name) diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py index b9fc48086..55576059a 100644 --- a/orangecontrib/text/tests/test_corpus.py +++ b/orangecontrib/text/tests/test_corpus.py @@ -10,10 +10,13 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus -from orangecontrib.text.tag import pos_tagger +from orangecontrib.text.tag import AveragedPerceptronTagger class CorpusTests(unittest.TestCase): + def setUp(self): + self.pos_tagger = AveragedPerceptronTagger() + def test_init_preserve_shape_of_empty_x(self): c = Corpus.from_file('book-excerpts') d = c.domain @@ -66,7 +69,7 @@ def test_extend(self): c2 = c[:5] self.assertEqual(len(c2), 5) n = len(c) - pos_tagger.tag_corpus(c) + self.pos_tagger.tag_corpus(c) self.assertIsNot(c._tokens, None) self.assertIsNot(c.pos_tags, None) self.assertIs(c2._tokens, None) @@ -77,8 +80,8 @@ def test_extend(self): self.assertIs(c._tokens, None) self.assertIs(c.pos_tags, None) - pos_tagger.tag_corpus(c) - pos_tagger.tag_corpus(c2) + self.pos_tagger.tag_corpus(c) + self.pos_tagger.tag_corpus(c2) c.extend(c2) self.assertEqual(len(c), n + 10) self.assertEqual(len(c._tokens), n + 10) @@ -330,7 +333,7 @@ def test_ngrams_iter(self): self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0]) self.assertIn('-'.join(ngram), list(c.ngrams_iterator(join_with='-'))[0]) - pos_tagger.tag_corpus(c) + self.pos_tagger.tag_corpus(c) c.ngram_range = (1, 1) for doc in c.ngrams_iterator(join_with='_', include_postags=True): for token in doc: diff --git a/orangecontrib/text/tests/test_tags.py b/orangecontrib/text/tests/test_tags.py index 34d272e9d..1300bf764 100644 --- a/orangecontrib/text/tests/test_tags.py +++ b/orangecontrib/text/tests/test_tags.py @@ -10,7 +10,7 @@ class POSTaggerTests(unittest.TestCase): def test_POSTagger(self): corpus = Corpus.from_file('deerwester') - tagger = tag.pos_tagger + tagger = tag.AveragedPerceptronTagger() result = tagger.tag_corpus(corpus) self.assertTrue(hasattr(result, 'pos_tags')) # for token in itertools.chain(*result.tokens): @@ -33,7 +33,7 @@ def test_str(self): def test_preprocess(self): pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer('\w+'), - pos_tagger=tag.taggers[0]) + pos_tagger=tag.AveragedPerceptronTagger()) corpus = Corpus.from_file('deerwester') pr(corpus, inplace=True) self.assertIsNotNone(corpus.pos_tags) diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index f38adf05b..6334b1003 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -12,8 +12,8 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus from orangecontrib.text.misc import nltk_data_dir -from orangecontrib.text.tag import StanfordPOSTagger -from orangecontrib.text.tag import taggers +from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \ + MaxEntTagger from orangecontrib.text.widgets.utils import widgets, ResourceLoader from orangecontrib.text.widgets.utils.concurrent import asynchronous @@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule): attribute = 'pos_tagger' enabled = settings.Setting(False) - STANFORD = len(taggers) stanford = settings.SettingProvider(ResourceLoader) - methods = taggers + [StanfordPOSTagger] + methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger] + STANFORD = 2 + initialize_methods = False def setup_method_layout(self): super().setup_method_layout() + # initialize all methods except StanfordPOSTagger + # cannot be done in superclass due to StanfordPOSTagger + for i, method in enumerate(self.methods[:self.STANFORD]): + self.methods[i] = method() + self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)', provider_format='Java file (*.jar)', model_button_label='Model', provider_button_label='Tagger')