Refactor preprocessors
VesnaT committed Mar 19, 2020
1 parent b0f6807 commit d410a53
Showing 27 changed files with 2,888 additions and 1,445 deletions.
36 changes: 26 additions & 10 deletions orangecontrib/text/corpus.py
@@ -12,6 +12,7 @@
 
 from Orange.data import ContinuousVariable, DiscreteVariable, \
     Domain, RowInstance, Table, StringVariable
+
 from orangecontrib.text.vectorization import BowVectorizer
 

@@ -69,6 +70,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
         self.pos_tags = None
         self.used_preprocessor = None  # required for compute values
         self._titles: Optional[np.ndarray] = None
+        self._pp_documents = None  # preprocessed documents
 
         if domain is not None and text_features is None:
             self._infer_text_features()
@@ -255,12 +257,19 @@ def extend_attributes(self, X, feature_names, feature_values=None,
 
     @property
     def documents(self):
-        """
-        Returns: a list of strings representing documents — created by joining
-        selected text features.
-        """
+        """ Returns a list of strings representing documents — created
+        by joining selected text features. """
         return self.documents_from_features(self.text_features)
 
+    @property
+    def pp_documents(self):
+        """ Preprocessed documents (transformed). """
+        return self._pp_documents or self.documents
+
+    @pp_documents.setter
+    def pp_documents(self, documents):
+        self._pp_documents = documents
+
     @property
     def titles(self):
         """ Returns a list of titles. """
@@ -298,27 +307,33 @@ def store_tokens(self, tokens, dictionary=None):
     def tokens(self):
         """
         np.ndarray: A list of lists containing tokens. If tokens are not yet
-        present, run default preprocessor and save tokens.
+        present, run default preprocessor and return tokens.
         """
         if self._tokens is None:
-            self._apply_base_preprocessor()
+            return self._base_tokens()[0]
         return self._tokens
 
     def has_tokens(self):
         """ Return whether corpus is preprocessed or not. """
         return self._tokens is not None
 
-    def _apply_base_preprocessor(self):
-        from orangecontrib.text.preprocess import base_preprocessor
-        base_preprocessor(self)
+    def _base_tokens(self):
+        from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
+            BASE_TOKENIZER, PreprocessorList
+
+        # don't use anything that requires NLTK data to assure async download
+        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
+                                               BASE_TOKENIZER])
+        corpus = base_preprocessors(self)
+        return corpus.tokens, corpus.dictionary
 
     @property
     def dictionary(self):
         """
         corpora.Dictionary: A token to id mapper.
         """
         if self._dictionary is None:
-            self._apply_base_preprocessor()
+            return self._base_tokens()[1]
         return self._dictionary
 
     def ngrams_iterator(self, join_with=' ', include_postags=False):
@@ -369,6 +384,7 @@ def copy(self):
         c.name = self.name
         c.used_preprocessor = self.used_preprocessor
         c._titles = self._titles
+        c._pp_documents = self._pp_documents
         return c
 
     @staticmethod
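A note on the corpus.py changes: `pp_documents` lets a transformer store preprocessed document strings without overwriting the originals, and `_base_tokens` replaces `_apply_base_preprocessor` so that reading `tokens` from an unpreprocessed corpus computes them on the fly instead of mutating the corpus. A minimal sketch of the resulting behavior (the printed output is copied from the updated doctest and assumes BASE_TRANSFORMER lowercases, as there; everything else follows directly from the diff):

    from orangecontrib.text import Corpus

    corpus = Corpus.from_file('book-excerpts')

    # With nothing stored yet, pp_documents falls back to the raw documents.
    assert corpus.pp_documents == corpus.documents

    # Reading .tokens runs the base transformer + tokenizer on the fly ...
    print(corpus.tokens[0][:10])
    # ['the', 'house', 'jim', 'says', 'he', 'rum', ';', 'and', 'as', 'he']

    # ... but nothing is cached, so the corpus itself stays unpreprocessed.
    assert not corpus.has_tokens()

    # A transformer records its output through the setter; the getter then
    # returns the stored list instead of the raw documents.
    corpus.pp_documents = [doc.lower() for doc in corpus.documents]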
33 changes: 22 additions & 11 deletions orangecontrib/text/preprocess/__init__.py
@@ -5,19 +5,30 @@
 >>> from orangecontrib.text import Corpus
 >>> corpus = Corpus.from_file('book-excerpts')
 
-And create a :class:`Preprocessor` object with the methods you want:
+And create an instance of an arbitrary preprocessor:
 
 >>> from orangecontrib.text import preprocess
->>> p = preprocess.Preprocessor(transformers=[preprocess.LowercaseTransformer()],
-...                             tokenizer=preprocess.WordPunctTokenizer(),
-...                             normalizer=preprocess.SnowballStemmer('english'),
-...                             filters=[preprocess.StopwordsFilter('english'),
-...                                      preprocess.FrequencyFilter(min_df=.1)])
+>>> p = preprocess.LowercaseTransformer()
+>>> corpus = p(corpus)
+>>> corpus.tokens[0][:10]
+['the', 'house', 'jim', 'says', 'he', 'rum', ';', 'and', 'as', 'he']
 
-Then you can apply your preprocessor to the corpus and access tokens via the ``tokens`` attribute:
-
->>> new_corpus = p(corpus)
->>> new_corpus.tokens[0][:10]
+You can also create a :class:`PreprocessorList` object with the preprocessors you want:
+
+>>> from orangecontrib.text.preprocess import PreprocessorList
+>>> pp_list = [preprocess.LowercaseTransformer(),
+...            preprocess.WordPunctTokenizer(),
+...            preprocess.SnowballStemmer(),
+...            preprocess.StopwordsFilter(),
+...            preprocess.FrequencyFilter(min_df=.1)]
+>>> p = PreprocessorList(pp_list)
+
+Then you can apply your preprocessors to the corpus and access tokens via the ``tokens`` attribute:
+
+>>> corpus = Corpus.from_file('book-excerpts')
+>>> corpus = p(corpus)
+>>> corpus.tokens[0][:10]
 ['hous', 'say', ';', 'spoke', 'littl', 'one', 'hand', 'wall', 'hurt', '?']
@@ -30,8 +41,8 @@
 ['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']
 """
-from .preprocess import *
-from .tokenize import *
 from .filter import *
 from .normalize import *
+from .tokenize import *
 from .transform import *
+from .preprocess import *
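The preprocess/__init__.py docstring above captures the gist of the refactor: the monolithic `Preprocessor(transformers=..., tokenizer=..., normalizer=..., filters=...)` constructor gives way to small preprocessor objects that are each directly callable on a corpus and chainable through `PreprocessorList`. A before/after sketch of caller code; the old call mirrors the removed doctest, and the new chain mirrors the added one, split to show both the single-preprocessor and the list form:

    from orangecontrib.text import Corpus, preprocess
    from orangecontrib.text.preprocess import PreprocessorList

    corpus = Corpus.from_file('book-excerpts')

    # Old API (removed): one object configured up front via keyword arguments.
    # p = preprocess.Preprocessor(
    #     transformers=[preprocess.LowercaseTransformer()],
    #     tokenizer=preprocess.WordPunctTokenizer(),
    #     normalizer=preprocess.SnowballStemmer('english'),
    #     filters=[preprocess.StopwordsFilter('english'),
    #              preprocess.FrequencyFilter(min_df=.1)])

    # New API: a single preprocessor is applied on its own ...
    corpus = preprocess.LowercaseTransformer()(corpus)

    # ... and several are chained with PreprocessorList.
    p = PreprocessorList([preprocess.WordPunctTokenizer(),
                          preprocess.SnowballStemmer(),
                          preprocess.StopwordsFilter(),
                          preprocess.FrequencyFilter(min_df=.1)])
    corpus = p(corpus)
    print(corpus.tokens[0][:10])
    # expected, per the doctest: ['hous', 'say', ';', 'spoke', 'littl', ...]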
