Refactor preprocessors
VesnaT committed Mar 19, 2020
1 parent b0f6807 commit d410a53
Showing 27 changed files with 2,888 additions and 1,445 deletions.
36 changes: 26 additions & 10 deletions orangecontrib/text/corpus.py
@@ -12,6 +12,7 @@
 
 from Orange.data import ContinuousVariable, DiscreteVariable, \
     Domain, RowInstance, Table, StringVariable
+
 from orangecontrib.text.vectorization import BowVectorizer
 

@@ -69,6 +70,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
         self.pos_tags = None
         self.used_preprocessor = None  # required for compute values
         self._titles: Optional[np.ndarray] = None
+        self._pp_documents = None  # preprocessed documents
 
         if domain is not None and text_features is None:
             self._infer_text_features()
@@ -255,12 +257,19 @@ def extend_attributes(self, X, feature_names, feature_values=None,
 
     @property
     def documents(self):
-        """
-        Returns: a list of strings representing documents — created by joining
-        selected text features.
-        """
+        """ Returns a list of strings representing documents — created
+        by joining selected text features. """
         return self.documents_from_features(self.text_features)
 
+    @property
+    def pp_documents(self):
+        """ Preprocessed documents (transformed). """
+        return self._pp_documents or self.documents
+
+    @pp_documents.setter
+    def pp_documents(self, documents):
+        self._pp_documents = documents
+
     @property
     def titles(self):
         """ Returns a list of titles. """
@@ -298,27 +307,33 @@ def store_tokens(self, tokens, dictionary=None):
     def tokens(self):
         """
         np.ndarray: A list of lists containing tokens. If tokens are not yet
-        present, run default preprocessor and save tokens.
+        present, run default preprocessor and return tokens.
         """
         if self._tokens is None:
-            self._apply_base_preprocessor()
+            return self._base_tokens()[0]
         return self._tokens
 
     def has_tokens(self):
         """ Return whether corpus is preprocessed or not. """
         return self._tokens is not None
 
-    def _apply_base_preprocessor(self):
-        from orangecontrib.text.preprocess import base_preprocessor
-        base_preprocessor(self)
+    def _base_tokens(self):
+        from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
+            BASE_TOKENIZER, PreprocessorList
+
+        # don't use anything that requires NLTK data to assure async download
+        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
+                                               BASE_TOKENIZER])
+        corpus = base_preprocessors(self)
+        return corpus.tokens, corpus.dictionary
 
     @property
     def dictionary(self):
         """
         corpora.Dictionary: A token to id mapper.
         """
         if self._dictionary is None:
-            self._apply_base_preprocessor()
+            return self._base_tokens()[1]
         return self._dictionary
 
     def ngrams_iterator(self, join_with=' ', include_postags=False):
@@ -369,6 +384,7 @@ def copy(self):
         c.name = self.name
         c.used_preprocessor = self.used_preprocessor
         c._titles = self._titles
+        c._pp_documents = self._pp_documents
         return c
 
     @staticmethod
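A note on the corpus.py changes: `pp_documents` lets a transformer store preprocessed document strings without overwriting the originals, and `_base_tokens` replaces `_apply_base_preprocessor` so that reading `tokens` from an unpreprocessed corpus computes them on the fly instead of mutating the corpus. A minimal sketch of the resulting behavior (the printed output is copied from the updated doctest and assumes BASE_TRANSFORMER lowercases, as there; everything else follows directly from the diff):

    from orangecontrib.text import Corpus

    corpus = Corpus.from_file('book-excerpts')

    # With nothing stored yet, pp_documents falls back to the raw documents.
    assert corpus.pp_documents == corpus.documents

    # Reading .tokens runs the base transformer + tokenizer on the fly ...
    print(corpus.tokens[0][:10])
    # ['the', 'house', 'jim', 'says', 'he', 'rum', ';', 'and', 'as', 'he']

    # ... but nothing is cached, so the corpus itself stays unpreprocessed.
    assert not corpus.has_tokens()

    # A transformer records its output through the setter; the getter then
    # returns the stored list instead of the raw documents.
    corpus.pp_documents = [doc.lower() for doc in corpus.documents]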
33 changes: 22 additions & 11 deletions orangecontrib/text/preprocess/__init__.py
@@ -5,19 +5,30 @@
 >>> from orangecontrib.text import Corpus
 >>> corpus = Corpus.from_file('book-excerpts')
 
-And create a :class:`Preprocessor` object with the methods you want:
+And create an instance of an arbitrary preprocessor:
 
 >>> from orangecontrib.text import preprocess
->>> p = preprocess.Preprocessor(transformers=[preprocess.LowercaseTransformer()],
-...                             tokenizer=preprocess.WordPunctTokenizer(),
-...                             normalizer=preprocess.SnowballStemmer('english'),
-...                             filters=[preprocess.StopwordsFilter('english'),
-...                                      preprocess.FrequencyFilter(min_df=.1)])
+>>> p = preprocess.LowercaseTransformer()
+>>> corpus = p(corpus)
+>>> corpus.tokens[0][:10]
+['the', 'house', 'jim', 'says', 'he', 'rum', ';', 'and', 'as', 'he']
 
-Then you can apply your preprocessor to the corpus and access tokens via the ``tokens`` attribute:
-
->>> new_corpus = p(corpus)
->>> new_corpus.tokens[0][:10]
+You can also create a :class:`PreprocessorList` object with the preprocessors you want:
+
+>>> from orangecontrib.text.preprocess import PreprocessorList
+>>> pp_list = [preprocess.LowercaseTransformer(),
+...            preprocess.WordPunctTokenizer(),
+...            preprocess.SnowballStemmer(),
+...            preprocess.StopwordsFilter(),
+...            preprocess.FrequencyFilter(min_df=.1)]
+>>> p = PreprocessorList(pp_list)
+
+Then you can apply your preprocessors to the corpus and access tokens via the ``tokens`` attribute:
+
+>>> corpus = Corpus.from_file('book-excerpts')
+>>> corpus = p(corpus)
+>>> corpus.tokens[0][:10]
 ['hous', 'say', ';', 'spoke', 'littl', 'one', 'hand', 'wall', 'hurt', '?']
@@ -30,8 +41,8 @@
 ['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']
 """
-from .preprocess import *
-from .tokenize import *
 from .filter import *
 from .normalize import *
+from .tokenize import *
 from .transform import *
+from .preprocess import *
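The preprocess/__init__.py docstring above captures the gist of the refactor: the monolithic `Preprocessor(transformers=..., tokenizer=..., normalizer=..., filters=...)` constructor gives way to small preprocessor objects that are each directly callable on a corpus and chainable through `PreprocessorList`. A before/after sketch of caller code; the old call mirrors the removed doctest, and the new chain mirrors the added one, split to show both the single-preprocessor and the list form:

    from orangecontrib.text import Corpus, preprocess
    from orangecontrib.text.preprocess import PreprocessorList

    corpus = Corpus.from_file('book-excerpts')

    # Old API (removed): one object configured up front via keyword arguments.
    # p = preprocess.Preprocessor(
    #     transformers=[preprocess.LowercaseTransformer()],
    #     tokenizer=preprocess.WordPunctTokenizer(),
    #     normalizer=preprocess.SnowballStemmer('english'),
    #     filters=[preprocess.StopwordsFilter('english'),
    #              preprocess.FrequencyFilter(min_df=.1)])

    # New API: a single preprocessor is applied on its own ...
    corpus = preprocess.LowercaseTransformer()(corpus)

    # ... and several are chained with PreprocessorList.
    p = PreprocessorList([preprocess.WordPunctTokenizer(),
                          preprocess.SnowballStemmer(),
                          preprocess.StopwordsFilter(),
                          preprocess.FrequencyFilter(min_df=.1)])
    corpus = p(corpus)
    print(corpus.tokens[0][:10])
    # expected, per the doctest: ['hous', 'say', ';', 'spoke', 'littl', ...]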
