[ENH] Download NLTK data asynchronously #304

Merged · 4 commits · Nov 3, 2017
2 changes: 1 addition & 1 deletion orangecontrib/text/misc/__init__.py
@@ -1 +1 @@
from .environ import *
from .nltk_data_download import *
11 changes: 0 additions & 11 deletions orangecontrib/text/misc/environ.py

This file was deleted.

56 changes: 56 additions & 0 deletions orangecontrib/text/misc/nltk_data_download.py
@@ -0,0 +1,56 @@
import os
import sys
import time
from functools import wraps
from threading import Thread

import nltk
from Orange.misc.environ import data_dir_base

__all__ = ['wait_nltk_data', 'nltk_data_dir']

NLTK_DATA = [
'wordnet',
'stopwords',
'punkt',
'opinion_lexicon',
'vader_lexicon',
'averaged_perceptron_tagger',
'maxent_treebank_pos_tagger',
]


def nltk_data_dir():
""" Location where the NLTK data is stored. """
dir_ = os.path.join(data_dir_base(), 'Orange', 'nltk_data')
# make sure folder exists for ReadTheDocs
os.makedirs(dir_, exist_ok=True)
return dir_


is_done_loading = False


def _download_nltk_data():
global is_done_loading
nltk.download(NLTK_DATA, download_dir=nltk_data_dir())
is_done_loading = True
sys.stdout.flush()


Thread(target=_download_nltk_data).start()


def wait_nltk_data(func):
""" Decorator that waits until all NLTK data is downloaded. """
dir_ = nltk_data_dir()
if dir_ not in nltk.data.path: # assure NLTK knows where the data is
nltk.data.path.append(dir_)

@wraps(func)
def wrapper(*args, **kwargs):
global is_done_loading
while not is_done_loading:
time.sleep(.1)
return func(*args, **kwargs)
return wrapper
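
The module above starts the download on a background thread at import time, and `wait_nltk_data` makes any decorated callable poll `is_done_loading` before it runs. A minimal sketch of the intended consumer pattern (the class below is hypothetical, not part of this PR):

from nltk.corpus import stopwords

from orangecontrib.text.misc import wait_nltk_data


class EnglishStopwords:
    """Hypothetical consumer: safe to construct at any point after import."""

    @wait_nltk_data
    def __init__(self):
        # By the time the wrapper lets execution through, the background
        # thread has finished nltk.download(), so accessing the corpus
        # cannot raise a missing-resource LookupError.
        self.words = set(stopwords.words('english'))
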
4 changes: 0 additions & 4 deletions orangecontrib/text/preprocess/__init__.py
@@ -30,10 +30,6 @@
['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']

"""
import nltk

nltk.download(["wordnet", "stopwords", "punkt"])

from .filter import *
from .normalize import *
from .tokenize import *
3 changes: 3 additions & 0 deletions orangecontrib/text/preprocess/filter.py
@@ -5,6 +5,8 @@
from gensim import corpora
from nltk.corpus import stopwords

from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseTokenFilter', 'StopwordsFilter', 'LexiconFilter', 'RegexpFilter', 'FrequencyFilter']


@@ -65,6 +67,7 @@ class StopwordsFilter(BaseTokenFilter, WordListMixin):

supported_languages = [file.capitalize() for file in stopwords_listdir]

@wait_nltk_data
def __init__(self, language='English', word_list=None):
WordListMixin.__init__(self, word_list)
super().__init__()
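
With the constructor decorated, importing the module stays cheap; instantiation is the synchronization point. A hedged usage sketch, assuming the package layout in this PR:

from orangecontrib.text.preprocess import StopwordsFilter

# If the 'stopwords' corpus is still downloading, this blocks in
# wait_nltk_data's poll loop; once the data is present it behaves
# exactly as before this PR.
stopword_filter = StopwordsFilter(language='English')
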
6 changes: 6 additions & 0 deletions orangecontrib/text/preprocess/normalize.py
@@ -1,5 +1,7 @@
from nltk import stem

from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer',
'SnowballStemmer', 'DictionaryLookupNormalizer']

@@ -35,6 +37,10 @@ class WordNetLemmatizer(BaseNormalizer):
name = 'WordNet Lemmatizer'
normalizer = stem.WordNetLemmatizer().lemmatize

@wait_nltk_data
def __init__(self):
super().__init__()


class DictionaryLookupNormalizer(BaseNormalizer):
""" Normalizes token with a <token: canonical_form> dictionary. """
1 change: 1 addition & 0 deletions orangecontrib/text/preprocess/preprocess.py
@@ -5,6 +5,7 @@
__all__ = ['Preprocessor', 'base_preprocessor']


# don't use anything that requires NLTK data to assure async download
BASE_TOKENIZER = WordPunctTokenizer()
BASE_TRANSFORMERS = [LowercaseTransformer()]

5 changes: 5 additions & 0 deletions orangecontrib/text/preprocess/tokenize.py
@@ -1,6 +1,7 @@
import re
from nltk import tokenize

from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseTokenizer', 'WordPunctTokenizer', 'PunktSentenceTokenizer',
'RegexpTokenizer', 'WhitespaceTokenizer', 'TweetTokenizer']
@@ -48,6 +49,10 @@ class PunktSentenceTokenizer(BaseTokenizer):
tokenizer = tokenize.PunktSentenceTokenizer()
name = 'Sentence'

@wait_nltk_data
def __init__(self):
super().__init__()


class WhitespaceTokenizer(BaseTokenizer):
""" Split only by whitespace. """
10 changes: 5 additions & 5 deletions orangecontrib/text/sentiment/__init__.py
@@ -1,24 +1,23 @@
import nltk
import numpy as np
from nltk.corpus import opinion_lexicon
from nltk.sentiment import SentimentIntensityAnalyzer

from orangecontrib.text import Corpus
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import WordPunctTokenizer
from orangecontrib.text.vectorization.base import SharedTransform, \
VectorizationComputeValue

nltk.download(["opinion_lexicon", "vader_lexicon"])


class Liu_Hu_Sentiment:
positive = set(opinion_lexicon.positive())
negative = set(opinion_lexicon.negative())
sentiments = ('sentiment',)
name = 'Liu Hu'

@wait_nltk_data
def __init__(self):
super().__init__()
self.positive = set(opinion_lexicon.positive())
self.negative = set(opinion_lexicon.negative())

def transform(self, corpus, copy=True):
scores = []
@@ -46,6 +45,7 @@ class Vader_Sentiment:
sentiments = ('pos', 'neg', 'neu', 'compound')
name = 'Vader'

@wait_nltk_data
def __init__(self):
super().__init__()
self.vader = SentimentIntensityAnalyzer()
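
The substantive change in this file is moving the lexicon reads out of the class body: class attributes are evaluated at import time, which raced the background download, while instance attributes behind `@wait_nltk_data` are evaluated only after it completes. In outline (the `_Before`/`_After` names are for contrast only; the PR keeps the single `Liu_Hu_Sentiment` class):

from nltk.corpus import opinion_lexicon

from orangecontrib.text.misc import wait_nltk_data


# Before (anti-pattern): runs on import, possibly before the
# 'opinion_lexicon' files exist on disk.
class Liu_Hu_Sentiment_Before:
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())


# After: runs on first instantiation, once the download thread is done.
class Liu_Hu_Sentiment_After:
    @wait_nltk_data
    def __init__(self):
        self.positive = set(opinion_lexicon.positive())
        self.negative = set(opinion_lexicon.negative())
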
10 changes: 4 additions & 6 deletions orangecontrib/text/tag/__init__.py
@@ -5,17 +5,15 @@
This module provides a default `pos_tagger` that can be used for POSTagging an English corpus::

>>> from orangecontrib.text.corpus import Corpus
>>> from orangecontrib.text.tag import pos_tagger
>>> from orangecontrib.text.tag import AveragedPerceptronTagger
>>> corpus = Corpus.from_file('deerwester.tab')
>>> tagged_corpus = pos_tagger.tag_corpus(corpus)
>>> tagger = AveragedPerceptronTagger()
>>> tagged_corpus = tagger.tag_corpus(corpus)
>>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly
['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS']
>>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents
['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS']


"""

from .pos import POSTagger, StanfordPOSTagger, taggers

pos_tagger = taggers[0]
from .pos import *
24 changes: 18 additions & 6 deletions orangecontrib/text/tag/pos.py
@@ -2,8 +2,10 @@
import numpy as np

from orangecontrib.text.util import chunkable
from orangecontrib.text.misc import wait_nltk_data

nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger'])

__all__ = ['POSTagger', 'StanfordPOSTagger', 'AveragedPerceptronTagger', 'MaxEntTagger']


class POSTagger:
@@ -62,8 +64,18 @@ def __str__(self):
return "{} (model: {})".format(self.name, self._stanford_model)


taggers = [
POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
'Treebank POS Tagger (MaxEnt)'),
]
class AveragedPerceptronTagger(POSTagger):
name = 'Averaged Perceptron Tagger'

@wait_nltk_data
def __init__(self):
super().__init__(nltk.PerceptronTagger(), self.name)


class MaxEntTagger(POSTagger):
name = 'Treebank POS Tagger (MaxEnt)'

@wait_nltk_data
def __init__(self):
tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
super().__init__(tagger, self.name)
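
With the module-level `taggers` list gone, callers instantiate a tagger class, which defers both the NLTK-data wait and the model load to construction time. Usage, following the doctest in `tag/__init__.py`:

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.tag import AveragedPerceptronTagger

corpus = Corpus.from_file('deerwester')
tagger = AveragedPerceptronTagger()   # blocks here if data is still downloading
tagged_corpus = tagger.tag_corpus(corpus)
print(tagged_corpus.pos_tags[0])      # e.g. ['JJ', 'NN', 'NN', 'IN', ...]
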
13 changes: 8 additions & 5 deletions orangecontrib/text/tests/test_corpus.py
@@ -10,10 +10,13 @@

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.tag import pos_tagger
from orangecontrib.text.tag import AveragedPerceptronTagger


class CorpusTests(unittest.TestCase):
def setUp(self):
self.pos_tagger = AveragedPerceptronTagger()

def test_init_preserve_shape_of_empty_x(self):
c = Corpus.from_file('book-excerpts')
d = c.domain
@@ -66,7 +69,7 @@ def test_extend(self):
c2 = c[:5]
self.assertEqual(len(c2), 5)
n = len(c)
pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c)
self.assertIsNot(c._tokens, None)
self.assertIsNot(c.pos_tags, None)
self.assertIs(c2._tokens, None)
@@ -77,8 +80,8 @@ def test_extend(self):
self.assertIs(c._tokens, None)
self.assertIs(c.pos_tags, None)

pos_tagger.tag_corpus(c)
pos_tagger.tag_corpus(c2)
self.pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c2)
c.extend(c2)
self.assertEqual(len(c), n + 10)
self.assertEqual(len(c._tokens), n + 10)
@@ -330,7 +333,7 @@ def test_ngrams_iter(self):
self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0])
self.assertIn('-'.join(ngram), list(c.ngrams_iterator(join_with='-'))[0])

pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c)
c.ngram_range = (1, 1)
for doc in c.ngrams_iterator(join_with='_', include_postags=True):
for token in doc:
4 changes: 2 additions & 2 deletions orangecontrib/text/tests/test_tags.py
@@ -10,7 +10,7 @@
class POSTaggerTests(unittest.TestCase):
def test_POSTagger(self):
corpus = Corpus.from_file('deerwester')
tagger = tag.pos_tagger
tagger = tag.AveragedPerceptronTagger()
result = tagger.tag_corpus(corpus)
self.assertTrue(hasattr(result, 'pos_tags'))
# for token in itertools.chain(*result.tokens):
@@ -33,7 +33,7 @@ def test_str(self):

def test_preprocess(self):
pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer('\w+'),
pos_tagger=tag.taggers[0])
pos_tagger=tag.AveragedPerceptronTagger())
corpus = Corpus.from_file('deerwester')
pr(corpus, inplace=True)
self.assertIsNotNone(corpus.pos_tags)
14 changes: 10 additions & 4 deletions orangecontrib/text/widgets/owpreprocess.py
@@ -12,8 +12,8 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.tag import StanfordPOSTagger
from orangecontrib.text.tag import taggers
from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \
MaxEntTagger
from orangecontrib.text.widgets.utils import widgets, ResourceLoader
from orangecontrib.text.widgets.utils.concurrent import asynchronous

@@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule):
attribute = 'pos_tagger'
enabled = settings.Setting(False)

STANFORD = len(taggers)
stanford = settings.SettingProvider(ResourceLoader)

methods = taggers + [StanfordPOSTagger]
methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger]
STANFORD = 2

initialize_methods = False

def setup_method_layout(self):
super().setup_method_layout()
# initialize all methods except StanfordPOSTagger
# cannot be done in superclass due to StanfordPOSTagger
for i, method in enumerate(self.methods[:self.STANFORD]):
self.methods[i] = method()

self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)',
provider_format='Java file (*.jar)',
model_button_label='Model', provider_button_label='Tagger')
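
The widget now stores tagger classes rather than instances and sets `initialize_methods = False`, so nothing touches NLTK data until the layout is built. The idea in isolation (a sketch with hypothetical names, not the widget's real API):

from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger


class DeferredModule:
    # Classes, not instances: importing this module never blocks
    # on the NLTK download thread.
    methods = [AveragedPerceptronTagger, MaxEntTagger]

    def setup_method_layout(self):
        # Instantiate lazily; each constructor may wait via wait_nltk_data.
        self.methods = [cls() for cls in self.methods]
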
14 changes: 7 additions & 7 deletions orangecontrib/text/widgets/owsentimentanalysis.py
@@ -7,10 +7,6 @@
from orangecontrib.text.sentiment import Vader_Sentiment, Liu_Hu_Sentiment


METHODS = [Liu_Hu_Sentiment(),
Vader_Sentiment()]


class OWSentimentAnalysis(OWWidget):
name = "Sentiment Analysis"
description = "Predict sentiment from text."
@@ -30,10 +26,14 @@ class Outputs:

def __init__(self):
super().__init__()
self.METHODS = [
Liu_Hu_Sentiment(),
Vader_Sentiment()
]
self.corpus = None

gui.radioButtons(self.controlArea, self, "method_idx", box="Method",
btnLabels=[m.name for m in METHODS],
btnLabels=[m.name for m in self.METHODS],
callback=self._method_changed)

ac = gui.auto_commit(self.controlArea, self, 'autocommit', 'Commit',
@@ -51,15 +51,15 @@ def _method_changed(self):

def commit(self):
if self.corpus is not None:
method = METHODS[self.method_idx]
method = self.METHODS[self.method_idx]
out = method.transform(self.corpus)
self.Outputs.corpus.send(out)
else:
self.Outputs.corpus.send(None)

def send_report(self):
self.report_items((
('Method', METHODS[self.method_idx].name),
('Method', self.METHODS[self.method_idx].name),
))

