Commit
Merge pull request #304 from nikicc/async-nltk
[ENH] Download NLTK data asynchronously
kernc authored Nov 3, 2017
2 parents 162a1a2 + f95c48b commit e8b34d1
Showing 15 changed files with 126 additions and 51 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/misc/__init__.py
@@ -1 +1 @@
-from .environ import *
+from .nltk_data_download import *
11 changes: 0 additions & 11 deletions orangecontrib/text/misc/environ.py

This file was deleted.

56 changes: 56 additions & 0 deletions orangecontrib/text/misc/nltk_data_download.py
@@ -0,0 +1,56 @@
import os
import sys
import time
from functools import wraps
from threading import Thread

import nltk
from Orange.misc.environ import data_dir_base

__all__ = ['wait_nltk_data', 'nltk_data_dir']

NLTK_DATA = [
    'wordnet',
    'stopwords',
    'punkt',
    'opinion_lexicon',
    'vader_lexicon',
    'averaged_perceptron_tagger',
    'maxent_treebank_pos_tagger',
]


def nltk_data_dir():
    """ Location where the NLTK data is stored. """
    dir_ = os.path.join(data_dir_base(), 'Orange', 'nltk_data')
    # make sure folder exists for ReadTheDocs
    os.makedirs(dir_, exist_ok=True)
    return dir_


is_done_loading = False


def _download_nltk_data():
    global is_done_loading
    nltk.download(NLTK_DATA, download_dir=nltk_data_dir())
    is_done_loading = True
    sys.stdout.flush()


Thread(target=_download_nltk_data).start()


def wait_nltk_data(func):
    """ Decorator that waits until all NLTK data is downloaded. """
    dir_ = nltk_data_dir()
    if dir_ not in nltk.data.path:  # assure NLTK knows where the data is
        nltk.data.path.append(dir_)

    @wraps(func)
    def wrapper(*args, **kwargs):
        global is_done_loading
        while not is_done_loading:
            time.sleep(.1)
        return func(*args, **kwargs)
    return wrapper
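
Note the import-time side effect above: the download starts in a background thread the moment orangecontrib.text.misc is imported, and wait_nltk_data gates any code that needs the corpora. A minimal usage sketch (the helper function here is hypothetical, not part of the commit):

    from nltk.corpus import stopwords
    from orangecontrib.text.misc import wait_nltk_data

    @wait_nltk_data
    def english_stopwords():
        # blocks, polling every 0.1 s, until _download_nltk_data() flips
        # is_done_loading; only then is the corpus safe to read
        return set(stopwords.words('english'))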
4 changes: 0 additions & 4 deletions orangecontrib/text/preprocess/__init__.py
@@ -30,10 +30,6 @@
['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']
"""
-import nltk
-
-nltk.download(["wordnet", "stopwords", "punkt"])
-
from .filter import *
from .normalize import *
from .tokenize import *
3 changes: 3 additions & 0 deletions orangecontrib/text/preprocess/filter.py
@@ -5,6 +5,8 @@
from gensim import corpora
from nltk.corpus import stopwords

+from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseTokenFilter', 'StopwordsFilter', 'LexiconFilter', 'RegexpFilter', 'FrequencyFilter']


@@ -65,6 +67,7 @@ class StopwordsFilter(BaseTokenFilter, WordListMixin):

    supported_languages = [file.capitalize() for file in stopwords_listdir]

+    @wait_nltk_data
    def __init__(self, language='English', word_list=None):
        WordListMixin.__init__(self, word_list)
        super().__init__()
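
With @wait_nltk_data on the constructor, building a StopwordsFilter blocks until the stopwords corpus is in place rather than raising LookupError when imported too early. A short sketch, assuming the class is re-exported from orangecontrib.text.preprocess (its __init__.py does `from .filter import *`):

    from orangecontrib.text.preprocess import StopwordsFilter

    # returns immediately if the background download already finished;
    # otherwise waits for the 'stopwords' corpus first
    stopword_filter = StopwordsFilter(language='English')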
6 changes: 6 additions & 0 deletions orangecontrib/text/preprocess/normalize.py
@@ -1,5 +1,7 @@
from nltk import stem

+from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer',
           'SnowballStemmer', 'DictionaryLookupNormalizer']

@@ -35,6 +37,10 @@ class WordNetLemmatizer(BaseNormalizer):
    name = 'WordNet Lemmatizer'
    normalizer = stem.WordNetLemmatizer().lemmatize

+    @wait_nltk_data
+    def __init__(self):
+        super().__init__()


class DictionaryLookupNormalizer(BaseNormalizer):
    """ Normalizes token with a <token: canonical_form> dictionary. """
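
The class-level `normalizer` binding stays eager, which is safe because NLTK's WordNetLemmatizer only reads the corpus on first call; the decorated __init__ is what actually waits for 'wordnet'. A hedged example of the caller's view:

    from orangecontrib.text.preprocess import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()   # blocks until 'wordnet' is available
    lemmatizer.normalizer('wolves')    # expected: 'wolf' (default POS is noun)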
1 change: 1 addition & 0 deletions orangecontrib/text/preprocess/preprocess.py
@@ -5,6 +5,7 @@
__all__ = ['Preprocessor', 'base_preprocessor']


+# don't use anything that requires NLTK data to assure async download
BASE_TOKENIZER = WordPunctTokenizer()
BASE_TRANSFORMERS = [LowercaseTransformer()]

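
The comment is the crux of this file: the module-level defaults must not touch NLTK data, or importing the package would block on the download thread. WordPunctTokenizer is a pure regexp tokenizer, shown here with NLTK's class directly, which the preprocess wrapper presumably builds on (tokenize.py imports `from nltk import tokenize`):

    from nltk.tokenize import WordPunctTokenizer

    # no corpora or models needed, so this is safe at import time
    WordPunctTokenizer().tokenize("Human machine interface!")
    # -> ['Human', 'machine', 'interface', '!']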
5 changes: 5 additions & 0 deletions orangecontrib/text/preprocess/tokenize.py
@@ -1,6 +1,7 @@
import re
from nltk import tokenize

+from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseTokenizer', 'WordPunctTokenizer', 'PunktSentenceTokenizer',
           'RegexpTokenizer', 'WhitespaceTokenizer', 'TweetTokenizer']
@@ -48,6 +49,10 @@ class PunktSentenceTokenizer(BaseTokenizer):
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = 'Sentence'

+    @wait_nltk_data
+    def __init__(self):
+        super().__init__()


class WhitespaceTokenizer(BaseTokenizer):
    """ Split only by whitespace. """
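
Only Punkt needs downloaded data, so it alone gets a decorated constructor; the regexp- and whitespace-based tokenizers in the same module stay usable while the download is still running. A sketch, assuming both names are exported via __all__:

    from orangecontrib.text.preprocess import WhitespaceTokenizer, PunktSentenceTokenizer

    ws = WhitespaceTokenizer()         # instant: no NLTK data involved
    sent = PunktSentenceTokenizer()    # blocks until 'punkt' has downloaded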
10 changes: 5 additions & 5 deletions orangecontrib/text/sentiment/__init__.py
@@ -1,24 +1,23 @@
-import nltk
import numpy as np
from nltk.corpus import opinion_lexicon
from nltk.sentiment import SentimentIntensityAnalyzer

from orangecontrib.text import Corpus
+from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import WordPunctTokenizer
from orangecontrib.text.vectorization.base import SharedTransform, \
    VectorizationComputeValue

-nltk.download(["opinion_lexicon", "vader_lexicon"])


class Liu_Hu_Sentiment:
-    positive = set(opinion_lexicon.positive())
-    negative = set(opinion_lexicon.negative())
    sentiments = ('sentiment',)
    name = 'Liu Hu'

+    @wait_nltk_data
    def __init__(self):
        super().__init__()
+        self.positive = set(opinion_lexicon.positive())
+        self.negative = set(opinion_lexicon.negative())

    def transform(self, corpus, copy=True):
        scores = []
@@ -46,6 +45,7 @@ class Vader_Sentiment:
    sentiments = ('pos', 'neg', 'neu', 'compound')
    name = 'Vader'

+    @wait_nltk_data
    def __init__(self):
        super().__init__()
        self.vader = SentimentIntensityAnalyzer()
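
Moving the lexicon lookups off the class body is the key change here: `set(opinion_lexicon.positive())` used to run at class-definition time, i.e. during import, defeating the async download. A usage sketch (deerwester is a corpus bundled with the add-on; the added scores follow each method's `sentiments` tuple):

    from orangecontrib.text import Corpus
    from orangecontrib.text.sentiment import Vader_Sentiment

    corpus = Corpus.from_file('deerwester')
    method = Vader_Sentiment()          # waits for 'vader_lexicon'
    out = method.transform(corpus)      # scores: pos, neg, neu, compound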
10 changes: 4 additions & 6 deletions orangecontrib/text/tag/__init__.py
@@ -5,17 +5,15 @@
This module provides a default `pos_tagger` that can be used for POSTagging an English corpus::
>>> from orangecontrib.text.corpus import Corpus
->>> from orangecontrib.text.tag import pos_tagger
+>>> from orangecontrib.text.tag import AveragedPerceptronTagger
>>> corpus = Corpus.from_file('deerwester.tab')
->>> tagged_corpus = pos_tagger.tag_corpus(corpus)
+>>> tagger = AveragedPerceptronTagger()
+>>> tagged_corpus = tagger.tag_corpus(corpus)
>>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly
['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS']
>>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents
['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS']
"""

-from .pos import POSTagger, StanfordPOSTagger, taggers
-
-pos_tagger = taggers[0]
+from .pos import *
24 changes: 18 additions & 6 deletions orangecontrib/text/tag/pos.py
@@ -2,8 +2,10 @@
import numpy as np

from orangecontrib.text.util import chunkable
+from orangecontrib.text.misc import wait_nltk_data

-nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger'])

+__all__ = ['POSTagger', 'StanfordPOSTagger', 'AveragedPerceptronTagger', 'MaxEntTagger']


class POSTagger:
@@ -62,8 +64,18 @@ def __str__(self):
        return "{} (model: {})".format(self.name, self._stanford_model)


-taggers = [
-    POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
-    POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
-              'Treebank POS Tagger (MaxEnt)'),
-]
+class AveragedPerceptronTagger(POSTagger):
+    name = 'Averaged Perceptron Tagger'
+
+    @wait_nltk_data
+    def __init__(self):
+        super().__init__(nltk.PerceptronTagger(), self.name)
+
+
+class MaxEntTagger(POSTagger):
+    name = 'Treebank POS Tagger (MaxEnt)'
+
+    @wait_nltk_data
+    def __init__(self):
+        tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
+        super().__init__(tagger, self.name)
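
The eagerly built `taggers` list, which loaded the MaxEnt pickle at import, becomes a pair of classes that defer both the data wait and the model load until instantiation. The rough equivalent of the old `pos_tagger` default under the new API:

    from orangecontrib.text.tag import AveragedPerceptronTagger

    # the import is now cheap; this line is where the waiting happens
    pos_tagger = AveragedPerceptronTagger()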
13 changes: 8 additions & 5 deletions orangecontrib/text/tests/test_corpus.py
@@ -10,10 +10,13 @@

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.tag import pos_tagger
+from orangecontrib.text.tag import AveragedPerceptronTagger


class CorpusTests(unittest.TestCase):
+    def setUp(self):
+        self.pos_tagger = AveragedPerceptronTagger()
+
    def test_init_preserve_shape_of_empty_x(self):
        c = Corpus.from_file('book-excerpts')
        d = c.domain
@@ -66,7 +69,7 @@ def test_extend(self):
        c2 = c[:5]
        self.assertEqual(len(c2), 5)
        n = len(c)
-        pos_tagger.tag_corpus(c)
+        self.pos_tagger.tag_corpus(c)
        self.assertIsNot(c._tokens, None)
        self.assertIsNot(c.pos_tags, None)
        self.assertIs(c2._tokens, None)
@@ -77,8 +80,8 @@ def test_extend(self):
        self.assertIs(c._tokens, None)
        self.assertIs(c.pos_tags, None)

-        pos_tagger.tag_corpus(c)
-        pos_tagger.tag_corpus(c2)
+        self.pos_tagger.tag_corpus(c)
+        self.pos_tagger.tag_corpus(c2)
        c.extend(c2)
        self.assertEqual(len(c), n + 10)
        self.assertEqual(len(c._tokens), n + 10)
@@ -330,7 +333,7 @@ def test_ngrams_iter(self):
        self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0])
        self.assertIn('-'.join(ngram), list(c.ngrams_iterator(join_with='-'))[0])

-        pos_tagger.tag_corpus(c)
+        self.pos_tagger.tag_corpus(c)
        c.ngram_range = (1, 1)
        for doc in c.ngrams_iterator(join_with='_', include_postags=True):
            for token in doc:
4 changes: 2 additions & 2 deletions orangecontrib/text/tests/test_tags.py
@@ -10,7 +10,7 @@
class POSTaggerTests(unittest.TestCase):
    def test_POSTagger(self):
        corpus = Corpus.from_file('deerwester')
-        tagger = tag.pos_tagger
+        tagger = tag.AveragedPerceptronTagger()
        result = tagger.tag_corpus(corpus)
        self.assertTrue(hasattr(result, 'pos_tags'))
        # for token in itertools.chain(*result.tokens):
@@ -33,7 +33,7 @@ def test_str(self):

    def test_preprocess(self):
        pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer('\w+'),
-                                     pos_tagger=tag.taggers[0])
+                                     pos_tagger=tag.AveragedPerceptronTagger())
        corpus = Corpus.from_file('deerwester')
        pr(corpus, inplace=True)
        self.assertIsNotNone(corpus.pos_tags)
14 changes: 10 additions & 4 deletions orangecontrib/text/widgets/owpreprocess.py
@@ -12,8 +12,8 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.misc import nltk_data_dir
-from orangecontrib.text.tag import StanfordPOSTagger
-from orangecontrib.text.tag import taggers
+from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \
+    MaxEntTagger
from orangecontrib.text.widgets.utils import widgets, ResourceLoader
from orangecontrib.text.widgets.utils.concurrent import asynchronous

@@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule):
    attribute = 'pos_tagger'
    enabled = settings.Setting(False)

-    STANFORD = len(taggers)
    stanford = settings.SettingProvider(ResourceLoader)

-    methods = taggers + [StanfordPOSTagger]
+    methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger]
+    STANFORD = 2

+    initialize_methods = False

    def setup_method_layout(self):
        super().setup_method_layout()
+        # initialize all methods except StanfordPOSTagger
+        # cannot be done in superclass due to StanfordPOSTagger
+        for i, method in enumerate(self.methods[:self.STANFORD]):
+            self.methods[i] = method()

        self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)',
                                       provider_format='Java file (*.jar)',
                                       model_button_label='Model', provider_button_label='Tagger')
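
The widget stores tagger classes rather than instances and instantiates all but StanfordPOSTagger while building its layout, so the waiting happens when the widget opens, not when the module is imported. The same deferred-instantiation idea reduced to a toy sketch (all names hypothetical):

    class SlowResource:
        def __init__(self):
            pass                        # stands in for a @wait_nltk_data __init__

    METHODS = [SlowResource]            # holding the class is free

    def setup():
        # any blocking happens here, safely off the import path
        return [cls() for cls in METHODS]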
14 changes: 7 additions & 7 deletions orangecontrib/text/widgets/owsentimentanalysis.py
@@ -7,10 +7,6 @@
from orangecontrib.text.sentiment import Vader_Sentiment, Liu_Hu_Sentiment


-METHODS = [Liu_Hu_Sentiment(),
-           Vader_Sentiment()]


class OWSentimentAnalysis(OWWidget):
name = "Sentiment Analysis"
description = "Predict sentiment from text."
@@ -30,10 +26,14 @@ class Outputs:

    def __init__(self):
        super().__init__()
+        self.METHODS = [
+            Liu_Hu_Sentiment(),
+            Vader_Sentiment()
+        ]
        self.corpus = None

        gui.radioButtons(self.controlArea, self, "method_idx", box="Method",
-                         btnLabels=[m.name for m in METHODS],
+                         btnLabels=[m.name for m in self.METHODS],
                         callback=self._method_changed)

        ac = gui.auto_commit(self.controlArea, self, 'autocommit', 'Commit',
@@ -51,15 +51,15 @@ def _method_changed(self):

    def commit(self):
        if self.corpus is not None:
-            method = METHODS[self.method_idx]
+            method = self.METHODS[self.method_idx]
            out = method.transform(self.corpus)
            self.Outputs.corpus.send(out)
        else:
            self.Outputs.corpus.send(None)

    def send_report(self):
        self.report_items((
-            ('Method', METHODS[self.method_idx].name),
+            ('Method', self.METHODS[self.method_idx].name),
        ))

