From 94d3a97a99f741849bc91b2e984c7b527c6e2902 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Tue, 24 Oct 2023 11:24:16 +0200
Subject: [PATCH] Statistics - Select statistic computation source
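
Add a "Compute for" option to the Statistics widget: each statistic can
now be computed either on raw documents or on preprocessed tokens/ngrams.
Rules gain a third element - for illustration, a rule stored as (1, "")
becomes (1, "", Sources.DOCUMENTS). Settings are bumped to version 2 and
old rules are migrated to the source that matches the previous behaviour
(Regex used to run on tokens; other dual-source statistics ran on
documents).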
---
orangecontrib/text/widgets/owstatistics.py | 277 ++++++++++--------
.../text/widgets/tests/test_owstatistics.py | 199 +++++++++----
2 files changed, 309 insertions(+), 167 deletions(-)
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
index f465ee3ac..eb4296d8f 100644
--- a/orangecontrib/text/widgets/owstatistics.py
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -3,7 +3,7 @@
from copy import copy
from itertools import groupby
from string import punctuation
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, Union, Iterator, Dict
import numpy as np
from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
@@ -14,19 +14,18 @@
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output, OWWidget
from nltk import tokenize
+from orangecanvas.gui.utils import disconnected
from orangewidget.widget import Msg
from orangecontrib.text import Corpus
-# those functions are implemented here since they are used in more statistics
-from orangecontrib.text.preprocess import (
- LowercaseTransformer,
- RegexpTokenizer,
- PreprocessorList
-)
+class Sources:
+ DOCUMENTS = "Documents"
+ TOKENS = "Preprocessed tokens" # tokens or ngrams - depending on statistic
-def num_words(document: str, callback: Callable) -> int:
+
+def num_words(document: Union[str, List], callback: Callable) -> int:
"""
    Return the number of words in a document string. A word is any entity
    separated by spaces, tabs, or newlines.
@@ -35,11 +34,13 @@ def num_words(document: str, callback: Callable) -> int:
return len(document.split())
-def char_count(document: str, callback: Callable) -> int:
+def char_count(document: Union[str, List], callback: Callable) -> int:
"""
    Count the number of alphanumeric characters in the document.
"""
callback()
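+    # token/ngram sources yield a list of strings - join it before counting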
+    if isinstance(document, list):
+        document = "".join(document)
return sum(c.isalnum() for c in document)
@@ -52,37 +53,32 @@ def digit_count(document: str, callback: Callable) -> int:
def count_appearances(
- document: str, characters: List[str], callback: Callable
+ document: Union[str, List], characters: List[str], callback: Callable
) -> int:
"""
Count number of appearances of chars from `characters` list.
"""
callback()
# I think it supports the majority of main languages
- # Y can be vowel too sometimes - it is not possible to distinguish
- return sum(document.lower().count(c) for c in characters)
+    # Y can be a vowel too sometimes - it is not possible to distinguish
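+    # document sources are plain strings; token/ngram sources are lists of strings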
+ if isinstance(document, str):
+ return sum(document.lower().count(c) for c in characters)
+ else:
+ return sum(d.lower().count(c) for c in characters for d in document)
-def preprocess_only_words(corpus: Corpus) -> Corpus:
+def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
"""
- Apply the preprocessor that splits words, transforms them to lower case
- (and removes punctuations).
-
- Parameters
- ----------
- corpus
- Corpus on which the preprocessor will be applied.
-
- Returns
- -------
- Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
+ Extract source from corpus according to source variable:
+ - if source == Sources.DOCUMENTS return documents
+ - if source == Sources.TOKENS return ngrams
"""
- p = PreprocessorList(
- [LowercaseTransformer(),
- # by default regexp keeps only words (no punctuations, no spaces)
- RegexpTokenizer()]
- )
- return p(corpus)
+ if source == "Documents":
+ return corpus.documents
+ elif source == "Preprocessed tokens":
+ return corpus.ngrams
+ else:
+ raise ValueError(f"Wrong source {source}")
# every statistic returns a np.ndarray with statistics
@@ -91,38 +87,34 @@ def preprocess_only_words(corpus: Corpus) -> Corpus:
def words_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of words in each document.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.DOCUMENTS
# np.c_ makes column vector (ndarray) out of the list
# [1, 2, 3] -> [[1], [2], [3]]
- return (
- np.c_[[num_words(d, callback) for d in corpus.documents]],
- ["Word count"],
- )
+ return np.c_[[num_words(d, callback) for d in corpus.documents]], ["Word count"]
def characters_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of characters without spaces, newlines, tabs, ...
"""
- return (
- np.c_[[char_count(d, callback) for d in corpus.documents]],
- ["Character count"],
- )
+ source = get_source(corpus, source)
+ return np.c_[[char_count(d, callback) for d in source]], ["Character count"]
def n_gram_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of n-grams in every document
"""
+ assert source == Sources.TOKENS
def ng_count(n_gram: List[str]):
callback()
@@ -132,11 +124,12 @@ def ng_count(n_gram: List[str]):
def average_word_len(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Compute the average word length: character count / word count.
"""
+ assert source == Sources.DOCUMENTS
return (
np.c_[
[
@@ -149,11 +142,12 @@ def average_word_len(
def punctuation_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Count the number of punctuation characters
"""
+ assert source == Sources.DOCUMENTS
def num_punctuation(document: str):
callback()
@@ -166,11 +160,12 @@ def num_punctuation(document: str):
def capital_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of capital letters in documents
"""
+ assert source == Sources.DOCUMENTS
def num_capitals(document: str):
callback()
@@ -183,11 +178,13 @@ def num_capitals(document: str):
def vowel_count(
- corpus: Corpus, vowels: str, callback: Callable
+ corpus: Corpus, vowels: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of vowels in documents
"""
+ assert source == Sources.DOCUMENTS
+
# comma separated string of vowels to list
vowels = [v.strip() for v in vowels.split(",")]
return (
@@ -199,12 +196,14 @@ def vowel_count(
def consonant_count(
- corpus: Corpus, consonants: str, callback: Callable
+ corpus: Corpus, consonants: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of consonants in documents. Consonants are all alnum
characters except vowels and numbers
"""
+ assert source == Sources.DOCUMENTS
+
# comma separated string of consonants to list
consonants = [v.strip() for v in consonants.split(",")]
return (
@@ -219,12 +218,12 @@ def consonant_count(
def per_cent_unique_words(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Ratio between unique words count and all words count
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def perc_unique(tokens: str):
callback()
@@ -232,83 +231,84 @@ def perc_unique(tokens: str):
return np.nan
return len(set(tokens)) / len(tokens)
- return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"]
+ return np.c_[list(map(perc_unique, corpus.ngrams))], ["% unique words"]
def starts_with(
- corpus: Corpus, prefix: str, callback: Callable
+ corpus: Corpus, prefix: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Number of words that start with the string in `prefix`.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def number_starts_with(tokens: List[str]):
callback()
return sum(t.startswith(prefix) for t in tokens)
return (
- np.c_[list(map(number_starts_with, corpus.tokens))],
+ np.c_[list(map(number_starts_with, corpus.ngrams))],
[f"Starts with {prefix}"],
)
def ends_with(
- corpus: Corpus, postfix: str, callback: Callable
+ corpus: Corpus, postfix: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Number of words that end with the string in `postfix`.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def number_ends_with(tokens: List[str]):
callback()
return sum(t.endswith(postfix) for t in tokens)
return (
- np.c_[list(map(number_ends_with, corpus.tokens))],
+ np.c_[list(map(number_ends_with, corpus.ngrams))],
[f"Ends with {postfix}"],
)
def contains(
- corpus: Corpus, text: str, callback: Callable
+ corpus: Corpus, text: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Count occurrences of the string `text`.
"""
+ source = get_source(corpus, source)
return (
- np.c_[
- [count_appearances(d, [text], callback) for d in corpus.documents]
- ],
+ np.c_[[count_appearances(d, [text], callback) for d in source]],
[f"Contains {text}"],
)
def regex(
- corpus: Corpus, expression: str, callback: Callable
+ corpus: Corpus, expression: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count occurrences of pattern in `expression`.
"""
pattern = re.compile(expression)
- def number_regex(tokens: List[str]):
+ def regex_matches(text: Union[str, List]):
callback()
- return sum(bool(pattern.match(t)) for t in tokens)
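+        # documents arrive as plain strings, token/ngram sources as lists of strings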
+ if isinstance(text, str):
+ return len(re.findall(pattern, text))
+ else:
+ return sum(len(re.findall(pattern, ngram)) for ngram in text)
- return (
- np.c_[list(map(number_regex, corpus.tokens))],
- [f"Regex {expression}"],
- )
+ source = get_source(corpus, source)
+ return np.c_[list(map(regex_matches, source))], [f"Regex {expression}"]
def pos_tags(
- corpus: Corpus, pos_tags: str, callback: Callable
+ corpus: Corpus, pos_tags: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
Count number of specified pos tags in corpus
"""
+ assert source == Sources.TOKENS
p_tags = [v.strip().lower() for v in pos_tags.split(",")]
def cust_count(tags):
@@ -325,7 +325,7 @@ def cust_count(tags):
def yule(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
    Yule's I measure: a higher number means greater diversity - a richer vocabulary
@@ -333,6 +333,7 @@ def yule(
Mathematical Proceedings of the Cambridge Philosophical Society, 42(2), B1-B2.
doi:10.1017/S0305004100022799
"""
+ assert source == Sources.TOKENS
if corpus.pos_tags is None:
return None
@@ -354,13 +355,13 @@ def yules_i(tags):
def lix(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
Readability index LIX
https://en.wikipedia.org/wiki/Lix_(readability_test)
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
tokenizer = tokenize.PunktSentenceTokenizer()
def lix_index(document, tokens):
@@ -393,18 +394,21 @@ class ComputeValue:
pattern
Some statistics need additional parameter with the pattern
(e.g. starts with), for others it is set to empty string.
+ source
+ Part of the corpus used for computation: either tokens/ngrams or whole documents
"""
- def __init__(self, function: Callable, pattern: str) -> None:
+ def __init__(self, function: Callable, pattern: str, source: str) -> None:
self.function = function
self.pattern = pattern
+ self.source = source
def __call__(self, data: Corpus) -> np.ndarray:
"""
        Compute values on a new table.
"""
# lambda is added as a placeholder for a callback.
- return self.function(data, self.pattern, lambda: True)[0]
+ return self.function(data, self.pattern, self.source, lambda: True)[0]
def __eq__(self, other):
-        return self.function == other.function and self.pattern == other.pattern
+        return (
+            self.function == other.function
+            and self.pattern == other.pattern
+            and self.source == other.source
+        )
@@ -419,30 +423,32 @@ def __hash__(self):
STATISTICS = [
-    # (name of the statistics, function to compute, default value)
-    # if default value is None - text box is not required
+    # (name of the statistic, function to compute, default value, supported sources)
+    # if the default value is None, the text box is not required
- ("Word count", words_count, None),
- ("Character count", characters_count, None),
- ("N-gram count", n_gram_count, None),
- ("Average word length", average_word_len, None),
- ("Punctuation count", punctuation_count, None),
- ("Capital letter count", capital_count, None),
- ("Vowel count", vowel_count, "a,e,i,o,u"),
+ ("Word count", words_count, None, (Sources.DOCUMENTS,)),
+ ("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
+ ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
+ ("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
+ ("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
+ ("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
(
"Consonant count",
consonant_count,
"b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z",
+ (Sources.DOCUMENTS,),
),
- ("Per cent unique words", per_cent_unique_words, None),
- ("Starts with", starts_with, ""),
- ("Ends with", ends_with, ""),
- ("Contains", contains, ""),
- ("Regex", regex, ""),
- ("POS tag", pos_tags, "NN,VV,JJ"),
- ("Yule's I", yule, None),
- ("LIX index", lix, None),
+ ("Per cent unique terms", per_cent_unique_words, None, (Sources.TOKENS,)),
+ ("Starts with", starts_with, "", (Sources.TOKENS,)),
+ ("Ends with", ends_with, "", (Sources.TOKENS,)),
+ ("Contains", contains, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("Regex", regex, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("POS tag", pos_tags, "NN,VV,JJ", (Sources.TOKENS,)),
+ ("Yule's I", yule, None, (Sources.TOKENS,)),
+ ("LIX index", lix, None, (Sources.TOKENS,)),
]
STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2])
+STATISTICS_DEFAULT_SOURCES = list(list(zip(*STATISTICS))[3])
-def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+def run(corpus: Corpus, statistics: List[Tuple[int, str, str]], state: TaskState) -> None:
@@ -466,12 +472,12 @@ def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
def advance():
state.set_progress_value(next(tick_values))
- for s, patern in statistics:
+    for s, pattern, source in statistics:
fun = STATISTICS_FUNCTIONS[s]
- result = fun(corpus, patern, advance)
+        result = fun(corpus, pattern, source, advance)
if result is not None:
- result = result + (ComputeValue(fun, patern),)
- state.set_partial_result((s, patern, result))
+            result = result + (ComputeValue(fun, pattern, source),)
+            state.set_partial_result((s, pattern, source, result))
class OWStatistics(OWWidget, ConcurrentWidgetMixin):
@@ -491,12 +497,14 @@ class Warning(OWWidget.Warning):
"{} statistics cannot be computed and is omitted from results."
)
want_main_area = False
mainArea_width_height_ratio = None
- # settings
- default_rules = [(0, ""), (1, "")] # rules used to reset the active rules
- active_rules: List[Tuple[int, str]] = Setting(default_rules[:])
+ settings_version = 2
+ # rules used to reset the active rules
+    default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[1][-1][0])]
+ active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
# rules active at time of apply clicked
-    applied_rules: Optional[List[Tuple[int, str]]] = None
+    applied_rules: Optional[List[Tuple[int, str, str]]] = None
@@ -507,12 +515,14 @@ def __init__(self) -> None:
ConcurrentWidgetMixin.__init__(self)
self.corpus = None
- # the list with combos from the widget
- self.combos = []
+        # the list of combos for selecting statistics
+ self.statistics_combos = []
# the list with line edits from the widget
self.line_edits = []
# the list of buttons in front of controls that removes them
self.remove_buttons = []
+        # the list of combos for selecting the source a statistic is computed on
+ self.source_combos = []
self._init_controls()
@@ -542,6 +552,7 @@ def _init_statistics_box(self) -> None:
grid.setColumnStretch(2, 100)
grid.addWidget(QLabel("Feature"), 0, 1)
grid.addWidget(QLabel("Pattern"), 0, 2)
+ grid.addWidget(QLabel("Compute for"), 0, 3)
gui.button(
box,
@@ -562,7 +573,7 @@ def adjust_n_rule_rows(self) -> None:
"""
def _add_line():
- n_lines = len(self.combos) + 1
+ n_lines = len(self.statistics_combos) + 1
# add delete symbol
button = gui.button(
@@ -577,23 +588,29 @@ def _add_line():
combo.addItems(STATISTICS_NAMES)
combo.currentIndexChanged.connect(self._sync_edit_combo)
self.rules_grid.addWidget(combo, n_lines, 1)
- self.combos.append(combo)
+ self.statistics_combos.append(combo)
- # add line edit for patern
+ # add line edit for pattern
line_edit = QLineEdit()
self.rules_grid.addWidget(line_edit, n_lines, 2)
line_edit.textChanged.connect(self._sync_edit_line)
self.line_edits.append(line_edit)
+            # add dropdown for selecting the computation source
+ combo = QComboBox()
+ combo.currentIndexChanged.connect(self._sync_edit_source_combo)
+ self.rules_grid.addWidget(combo, n_lines, 3)
+ self.source_combos.append(combo)
+
def _remove_line():
- self.combos.pop().deleteLater()
+ self.statistics_combos.pop().deleteLater()
self.line_edits.pop().deleteLater()
+ self.source_combos.pop().deleteLater()
self.remove_buttons.pop().deleteLater()
def _fix_tab_order():
- # TODO: write it differently - check create class
- for i, (r, c, l) in enumerate(
- zip(self.active_rules, self.combos, self.line_edits)
+ for i, (r, c, l, s) in enumerate(
+ zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
):
c.setCurrentIndex(r[0]) # update combo
l.setText(r[1]) # update line edit
@@ -601,17 +618,21 @@ def _fix_tab_order():
l.setVisible(True)
else:
l.setVisible(False)
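+            # repopulate the source combo without re-triggering rule updates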
+ with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
+ s.clear()
+ s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
+ s.setCurrentText(r[2])
n = len(self.active_rules)
- while n > len(self.combos):
+ while n > len(self.statistics_combos):
_add_line()
- while len(self.combos) > n:
+ while len(self.statistics_combos) > n:
_remove_line()
_fix_tab_order()
def _add_row(self) -> None:
""" Add a new row to the statistic box """
- self.active_rules.append((0, ""))
+ self.active_rules.append((0, "", STATISTICS_DEFAULT_SOURCES[0][0]))
self.adjust_n_rule_rows()
def _remove_row(self) -> None:
@@ -623,20 +644,27 @@ def _remove_row(self) -> None:
def _sync_edit_combo(self) -> None:
""" Update rules when combo value changed """
combo = self.sender()
- edit_index = self.combos.index(combo)
+ edit_index = self.statistics_combos.index(combo)
selected_i = combo.currentIndex()
- default_value = STATISTICS_DEFAULT_VALUE[selected_i]
- self.active_rules[edit_index] = (selected_i, default_value)
+ default_value = STATISTICS_DEFAULT_VALUE[selected_i] or ""
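+        # reset source to the first one supported by the newly selected statistic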
+ default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0]
+ self.active_rules[edit_index] = (selected_i, default_value, default_source)
self.adjust_n_rule_rows()
def _sync_edit_line(self) -> None:
""" Update rules when line edit value changed """
line_edit = self.sender()
edit_index = self.line_edits.index(line_edit)
- self.active_rules[edit_index] = (
- self.active_rules[edit_index][0],
- line_edit.text(),
- )
+ arules = self.active_rules[edit_index]
+ self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2])
+
+ def _sync_edit_source_combo(self) -> None:
+ """ Update rules when source value change """
+ combo = self.sender()
+ edit_index = self.source_combos.index(combo)
+ value = combo.currentText()
+ arules = self.active_rules[edit_index]
+ self.active_rules[edit_index] = (arules[0], arules[1], value)
@Inputs.corpus
def set_data(self, corpus) -> None:
@@ -666,10 +694,10 @@ def on_exception(self, exception: Exception) -> None:
raise exception
def on_partial_result(
- self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]]
+ self, result: Tuple[int, str, str, Tuple[np.ndarray, List[str], Callable]]
) -> None:
- statistic, patern, result = result
- self.result_dict[(statistic, patern)] = result
+        statistic, pattern, source, result = result
+        self.result_dict[(statistic, pattern, source)] = result
def on_done(self, result: None) -> None:
# join results
@@ -707,6 +735,21 @@ def output_results(self) -> None:
)
self.Outputs.corpus.send(new_corpus)
+ @classmethod
+ def migrate_settings(cls, settings: Dict, version: int):
+ def def_source(idx):
+ """Return source that behaviour is the most similar to previous version"""
+ if STATISTICS_NAMES[idx] == "Regex":
+ # regex was working on tokens in the previous version
+ return Sources.TOKENS
+ # others that allow both sources were working on documents
+ return STATISTICS_DEFAULT_SOURCES[idx][0]
+
+ if version < 2:
+ if "active_rules" in settings:
+ new_rules = [(r, v, def_source(r)) for r, v in settings["active_rules"]]
+ settings["active_rules"] = new_rules
+
if __name__ == "__main__":
WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
index ad1820413..e3082e406 100644
--- a/orangecontrib/text/widgets/tests/test_owstatistics.py
+++ b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -5,11 +5,18 @@
from Orange.data import Domain, StringVariable
from Orange.widgets.tests.base import WidgetTest
+from Orange.widgets.tests.utils import simulate
from orangecontrib.text import Corpus
+from orangecontrib.text.preprocess import (
+ PreprocessorList,
+ LowercaseTransformer,
+ RegexpTokenizer,
+ StopwordsFilter,
+)
from orangecontrib.text.tag import AveragedPerceptronTagger
from orangecontrib.text.widgets.owstatistics import (
STATISTICS_NAMES,
- OWStatistics,
+ OWStatistics, Sources,
)
@@ -40,7 +47,9 @@ def _create_simple_data(self) -> None:
text_features=[text_var],
)
- def _set_feature(self, feature_name: str, value: str = ""):
+ def _set_feature(
+ self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+ ):
"""
        Set the statistic to be computed by the widget. Only one statistic
        is set.
@@ -52,11 +61,15 @@ def _set_feature(self, feature_name: str, value: str = ""):
value
            If the statistic needs a value (e.g. a prefix), it is passed here.
"""
- feature_index = STATISTICS_NAMES.index(feature_name)
- self.widget.active_rules = [(feature_index, value)]
- self.widget.adjust_n_rule_rows()
-
- def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
+ simulate.combobox_activate_item(self.widget.statistics_combos[0], feature_name)
+ self.widget.line_edits[0].setText(value)
+ simulate.combobox_activate_item(self.widget.source_combos[0], source)
+ for button in self.widget.remove_buttons[1:]:
+ button.click()
+
+ def _compute_features(
+ self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+ ) -> Corpus:
"""
        Send `self.corpus` to the widget, set the statistic to be computed,
run the computation, and return widget output.
@@ -74,7 +87,7 @@ def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()
- self._set_feature(feature_name, value)
+ self._set_feature(feature_name, value, source)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -101,7 +114,10 @@ def test_words_count(self):
def test_characters_count(self):
""" Test characters count statistic """
- data = self._compute_features("Character count")
+ data = self._compute_features("Character count", source=Sources.DOCUMENTS)
+ np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
+
+ data = self._compute_features("Character count", source=Sources.TOKENS)
np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -109,7 +125,7 @@ def test_characters_count(self):
def test_n_gram_count(self):
""" Test n-grams count statistic """
- data = self._compute_features("N-gram count")
+ data = self._compute_features("N-gram count", source=Sources.TOKENS)
np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -161,16 +177,16 @@ def test_consonants_count(self):
def test_per_cent_unique_words(self):
""" Test per-cent unique words statistic """
- data = self._compute_features("Per cent unique words")
+ data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
np.testing.assert_array_almost_equal(
- data.X.flatten(), [1, 1, 0.909091, 1]
+ data.X.flatten(), [1, 1, 0.84615, 1], decimal=5
)
with self.corpus.unlocked():
- self.corpus[1][-1] = ""
- data = self._compute_features("Per cent unique words")
+ self.corpus[1][-1] = " "
+ data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
np.testing.assert_array_almost_equal(
- data.X.flatten(), [1, np.nan, 0.909091, 1]
+ data.X.flatten(), [1, np.nan, 0.84615, 1], decimal=5
)
self.send_signal(self.widget.Inputs.corpus, None)
@@ -178,10 +194,10 @@ def test_per_cent_unique_words(self):
def test_starts_with(self):
""" Test starts with count statistic """
- data = self._compute_features("Starts with", "a")
+ data = self._compute_features("Starts with", "a", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2])
- data = self._compute_features("Starts with", "ap")
+ data = self._compute_features("Starts with", "ap", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -189,10 +205,10 @@ def test_starts_with(self):
def test_ends_with(self):
""" Test ends with count statistic """
- data = self._compute_features("Ends with", "t")
+ data = self._compute_features("Ends with", "t", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
- data = self._compute_features("Ends with", "et")
+ data = self._compute_features("Ends with", "et", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -200,28 +216,50 @@ def test_ends_with(self):
def test_contains(self):
""" Test contains count statistic """
- data = self._compute_features("Contains", "t")
+ data = self._compute_features("Contains", "t", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
- data = self._compute_features("Contains", "et")
+ data = self._compute_features("Contains", "et", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
- data = self._compute_features("Contains", "is")
+ data = self._compute_features("Contains", "is", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+ data = self._compute_features("Contains", "t", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+ data = self._compute_features("Contains", " ", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
self.send_signal(self.widget.Inputs.corpus, None)
self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
def test_regex(self):
""" Test regex statistic """
- # words that contains digit
- data = self._compute_features("Regex", "\w*\d\w*")
+        # words that contain a digit
+ data = self._compute_features("Regex", r"\w*\d\w*", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
- # words that contains digit
- data = self._compute_features("Regex", "\w*is\w*")
+        # words that contain "is"
+ data = self._compute_features("Regex", r"\w*is\w*", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+ # count specific n-gram
+ data = self._compute_features("Regex", r"ipsum\ dolor", Sources.DOCUMENTS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])
+
+    # words that contain a digit
+ data = self._compute_features("Regex", r"\w*\d\w*", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+    # words that contain "is"
+ data = self._compute_features("Regex", r"\w*is\w*", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ # count specific n-gram
+ data = self._compute_features("Regex", r"ipsum\ dolor", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
self.send_signal(self.widget.Inputs.corpus, None)
self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
@@ -232,7 +270,7 @@ def test_pos(self):
- test with corpus that has pos tags
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("POS tag", "NN")
+ self._set_feature("POS tag", "NN", Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -243,7 +281,7 @@ def test_pos(self):
result = tagger(self.corpus)
self.send_signal(self.widget.Inputs.corpus, result)
- self._set_feature("POS tag", "NN")
+ self._set_feature("POS tag", "NN", Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -258,7 +296,7 @@ def test_yule(self):
- test with corpus that has pos tags
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("Yule's I")
+ self._set_feature("Yule's I", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -271,7 +309,7 @@ def test_yule(self):
result = tagger(self.corpus)
self.send_signal(self.widget.Inputs.corpus, result)
- self._set_feature("Yule's I")
+ self._set_feature("Yule's I", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -287,7 +325,7 @@ def test_lix(self):
with self.corpus.unlocked():
self.corpus[1][-1] = "simple. simple."
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("LIX index")
+ self._set_feature("LIX index", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -295,6 +333,40 @@ def test_lix(self):
# the second document will have lower complexity than the first one
self.assertLess(res[1][0], res[0][0])
+ def test_stats_different_preprocessing(self):
+ pp = [LowercaseTransformer(), RegexpTokenizer(), StopwordsFilter(language="en")]
+ pp = PreprocessorList(pp)
+ self.corpus = pp(self.corpus)
+
+ data = self._compute_features("Character count", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [47, 44, 46, 51])
+
+ data = self._compute_features("N-gram count", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [8, 9, 9, 9])
+
+ data = self._compute_features("Per cent unique terms", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 1, 1])
+
+    # none start with a capital letter because of the LowercaseTransformer
+ data = self._compute_features("Starts with", "L", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+ data = self._compute_features("Starts with", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 0, 2])
+
+ data = self._compute_features("Ends with", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 1, 2, 1])
+
+    # none contain a comma since we use the RegexpTokenizer preprocessor
+ data = self._compute_features("Contains", ",", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+ data = self._compute_features("Contains", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 2, 6, 5])
+
+ data = self._compute_features("Regex", "{e", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
def test_statistics_combination(self):
"""
Testing three statistics at same time and see if column concatenated
@@ -306,9 +378,9 @@ def test_statistics_combination(self):
starts_with_index = STATISTICS_NAMES.index("Starts with")
capital_counts_index = STATISTICS_NAMES.index("Capital letter count")
self.widget.active_rules = [
- (wc_index, ""),
- (starts_with_index, "a"),
- (capital_counts_index, ""),
+ (wc_index, "", Sources.DOCUMENTS),
+ (starts_with_index, "a", Sources.TOKENS),
+ (capital_counts_index, "", Sources.DOCUMENTS),
]
self.widget.adjust_n_rule_rows()
@@ -333,43 +405,44 @@ def test_dictionary_statistics(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.widget.active_rules = [
- (1, ""),
- ]
+ self.widget.active_rules = [(1, "", Sources.DOCUMENTS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual([(1, None)], list(self.widget.result_dict.keys()))
+ expected = [(1, "", Sources.DOCUMENTS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
- self.widget.active_rules = [(1, ""), (2, "")]
+ self.widget.active_rules = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual(
- [(1, ""), (2, None)], list(self.widget.result_dict.keys())
- )
+ expected = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
- self.widget.active_rules = [(2, "")]
+ self.widget.active_rules = [(2, "", Sources.TOKENS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual([(2, None)], list(self.widget.result_dict.keys()))
+ expected = [(2, "", Sources.TOKENS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
        # dict should be empty on new data
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertListEqual([], list(self.widget.result_dict.keys()))
def test_settings(self):
- """ Test whether context correctly restore rules """
- rules = [(0, ""), (1, ""), (2, None)]
+ """Test whether context correctly restore rules"""
+ doc, tk = Sources.DOCUMENTS, Sources.TOKENS
+ rules = [(0, "", doc), (1, "", doc), (2, "", tk)]
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.widget.active_rules = rules[:]
self.send_signal(self.widget.Inputs.corpus, self.book_data)
- self.assertListEqual([(0, ""), (1, ""), (2, None)], self.widget.active_rules)
+ expected = [(0, "", doc), (1, "", doc), (2, "", tk)]
+ self.assertListEqual(expected, self.widget.active_rules)
def test_compute_values(self):
""" Test compute values on new data """
@@ -401,13 +474,13 @@ def test_add_row(self):
if x.text() == "+"
][0]
add_button.click()
- self.assertListEqual([(0, "")], self.widget.active_rules)
+ self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
def test_remove_row(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.widget.active_rules = [(0, "")]
+ self.widget.active_rules = [(0, "", Sources.DOCUMENTS)]
self.widget.adjust_n_rule_rows()
- self.assertListEqual([(0, "")], self.widget.active_rules)
+ self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
remove_button = [
x
@@ -417,6 +490,32 @@ def test_remove_row(self):
remove_button.click()
self.assertListEqual([], self.widget.active_rules)
+ def test_migrate_settings(self):
+ vals = [""] * 6 + ["a,e", "b,c", "", "a", "b", "c", r"\w*is", "NN,VV", "", ""]
+ settings = {"__version__": 1, "active_rules": list(zip(range(17), vals))}
+ widget = self.create_widget(OWStatistics, stored_settings=settings)
+ self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=widget)
+
+ expected = [
+ (0, "", Sources.DOCUMENTS),
+ (1, "", Sources.DOCUMENTS),
+ (2, "", Sources.TOKENS),
+ (3, "", Sources.DOCUMENTS),
+ (4, "", Sources.DOCUMENTS),
+ (5, "", Sources.DOCUMENTS),
+ (6, "a,e", Sources.DOCUMENTS),
+ (7, "b,c", Sources.DOCUMENTS),
+ (8, "", Sources.TOKENS),
+ (9, "a", Sources.TOKENS),
+ (10, "b", Sources.TOKENS),
+ (11, "c", Sources.DOCUMENTS),
+ (12, r"\w*is", Sources.DOCUMENTS),
+ (13, "NN,VV", Sources.TOKENS),
+ (14, "", Sources.TOKENS),
+ (15, "", Sources.TOKENS),
+ ]
+ self.assertListEqual(expected, widget.active_rules)
+
if __name__ == "__main__":
unittest.main()