From 94d3a97a99f741849bc91b2e984c7b527c6e2902 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Tue, 24 Oct 2023 11:24:16 +0200
Subject: [PATCH] Statistics - Select statistic computation source
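
Add a "Compute for" option to the Statistics widget: each statistic can
now be computed either on raw documents or on preprocessed tokens/ngrams.
Rules gain a third element - for illustration, a rule stored as (1, "")
becomes (1, "", Sources.DOCUMENTS). Settings are bumped to version 2 and
old rules are migrated to the source that matches the previous behaviour
(Regex used to run on tokens; other dual-source statistics ran on
documents).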
---
orangecontrib/text/widgets/owstatistics.py | 277 ++++++++++--------
.../text/widgets/tests/test_owstatistics.py | 199 +++++++++----
2 files changed, 309 insertions(+), 167 deletions(-)
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
index f465ee3ac..eb4296d8f 100644
--- a/orangecontrib/text/widgets/owstatistics.py
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -3,7 +3,7 @@
from copy import copy
from itertools import groupby
from string import punctuation
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, Union, Iterator, Dict
import numpy as np
from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
@@ -14,19 +14,18 @@
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output, OWWidget
from nltk import tokenize
+from orangecanvas.gui.utils import disconnected
from orangewidget.widget import Msg
from orangecontrib.text import Corpus
-# those functions are implemented here since they are used in more statistics
-from orangecontrib.text.preprocess import (
- LowercaseTransformer,
- RegexpTokenizer,
- PreprocessorList
-)
+class Sources:
+ DOCUMENTS = "Documents"
+ TOKENS = "Preprocessed tokens" # tokens or ngrams - depending on statistic
-def num_words(document: str, callback: Callable) -> int:
+
+def num_words(document: Union[str, List], callback: Callable) -> int:
"""
    Return the number of words in a document string. A word is any entity
    separated by spaces, tabs, or newlines.
@@ -35,11 +34,13 @@ def num_words(document: str, callback: Callable) -> int:
return len(document.split())
-def char_count(document: str, callback: Callable) -> int:
+def char_count(document: Union[str, List], callback: Callable) -> int:
"""
    Count the number of alphanumeric characters in the document.
"""
callback()
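+    # token/ngram sources yield a list of strings - join it before counting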
+    if isinstance(document, list):
+        document = "".join(document)
return sum(c.isalnum() for c in document)
@@ -52,37 +53,32 @@ def digit_count(document: str, callback: Callable) -> int:
def count_appearances(
- document: str, characters: List[str], callback: Callable
+ document: Union[str, List], characters: List[str], callback: Callable
) -> int:
"""
Count number of appearances of chars from `characters` list.
"""
callback()
# I think it supports the majority of main languages
- # Y can be vowel too sometimes - it is not possible to distinguish
- return sum(document.lower().count(c) for c in characters)
+    # Y can be a vowel too sometimes - it is not possible to distinguish
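+    # document sources are plain strings; token/ngram sources are lists of strings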
+ if isinstance(document, str):
+ return sum(document.lower().count(c) for c in characters)
+ else:
+ return sum(d.lower().count(c) for c in characters for d in document)
-def preprocess_only_words(corpus: Corpus) -> Corpus:
+def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
"""
- Apply the preprocessor that splits words, transforms them to lower case
- (and removes punctuations).
-
- Parameters
- ----------
- corpus
- Corpus on which the preprocessor will be applied.
-
- Returns
- -------
- Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
+ Extract source from corpus according to source variable:
+ - if source == Sources.DOCUMENTS return documents
+ - if source == Sources.TOKENS return ngrams
"""
- p = PreprocessorList(
- [LowercaseTransformer(),
- # by default regexp keeps only words (no punctuations, no spaces)
- RegexpTokenizer()]
- )
- return p(corpus)
+ if source == "Documents":
+ return corpus.documents
+ elif source == "Preprocessed tokens":
+ return corpus.ngrams
+ else:
+ raise ValueError(f"Wrong source {source}")
# every statistic returns a np.ndarray with statistics
@@ -91,38 +87,34 @@ def preprocess_only_words(corpus: Corpus) -> Corpus:
def words_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of words in each document.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.DOCUMENTS
# np.c_ makes column vector (ndarray) out of the list
# [1, 2, 3] -> [[1], [2], [3]]
- return (
- np.c_[[num_words(d, callback) for d in corpus.documents]],
- ["Word count"],
- )
+ return np.c_[[num_words(d, callback) for d in corpus.documents]], ["Word count"]
def characters_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of characters without spaces, newlines, tabs, ...
"""
- return (
- np.c_[[char_count(d, callback) for d in corpus.documents]],
- ["Character count"],
- )
+ source = get_source(corpus, source)
+ return np.c_[[char_count(d, callback) for d in source]], ["Character count"]
def n_gram_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of n-grams in every document
"""
+ assert source == Sources.TOKENS
def ng_count(n_gram: List[str]):
callback()
@@ -132,11 +124,12 @@ def ng_count(n_gram: List[str]):
def average_word_len(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Compute the average word length: character count / word count.
"""
+ assert source == Sources.DOCUMENTS
return (
np.c_[
[
@@ -149,11 +142,12 @@ def average_word_len(
def punctuation_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Count the number of punctuation characters
"""
+ assert source == Sources.DOCUMENTS
def num_punctuation(document: str):
callback()
@@ -166,11 +160,12 @@ def num_punctuation(document: str):
def capital_count(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of capital letters in documents
"""
+ assert source == Sources.DOCUMENTS
def num_capitals(document: str):
callback()
@@ -183,11 +178,13 @@ def num_capitals(document: str):
def vowel_count(
- corpus: Corpus, vowels: str, callback: Callable
+ corpus: Corpus, vowels: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of vowels in documents
"""
+ assert source == Sources.DOCUMENTS
+
# comma separated string of vowels to list
vowels = [v.strip() for v in vowels.split(",")]
return (
@@ -199,12 +196,14 @@ def vowel_count(
def consonant_count(
- corpus: Corpus, consonants: str, callback: Callable
+ corpus: Corpus, consonants: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count number of consonants in documents. Consonants are all alnum
characters except vowels and numbers
"""
+ assert source == Sources.DOCUMENTS
+
# comma separated string of consonants to list
consonants = [v.strip() for v in consonants.split(",")]
return (
@@ -219,12 +218,12 @@ def consonant_count(
def per_cent_unique_words(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Ratio between unique words count and all words count
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def perc_unique(tokens: str):
callback()
@@ -232,83 +231,84 @@ def perc_unique(tokens: str):
return np.nan
return len(set(tokens)) / len(tokens)
- return np.c_[list(map(perc_unique, corpus.tokens))], ["% unique words"]
+ return np.c_[list(map(perc_unique, corpus.ngrams))], ["% unique words"]
def starts_with(
- corpus: Corpus, prefix: str, callback: Callable
+ corpus: Corpus, prefix: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Number of words that start with the string in `prefix`.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def number_starts_with(tokens: List[str]):
callback()
return sum(t.startswith(prefix) for t in tokens)
return (
- np.c_[list(map(number_starts_with, corpus.tokens))],
+ np.c_[list(map(number_starts_with, corpus.ngrams))],
[f"Starts with {prefix}"],
)
def ends_with(
- corpus: Corpus, postfix: str, callback: Callable
+ corpus: Corpus, postfix: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Number of words that end with the string in `postfix`.
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
def number_ends_with(tokens: List[str]):
callback()
return sum(t.endswith(postfix) for t in tokens)
return (
- np.c_[list(map(number_ends_with, corpus.tokens))],
+ np.c_[list(map(number_ends_with, corpus.ngrams))],
[f"Ends with {postfix}"],
)
def contains(
- corpus: Corpus, text: str, callback: Callable
+ corpus: Corpus, text: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
    Count occurrences of the string `text`.
"""
+ source = get_source(corpus, source)
return (
- np.c_[
- [count_appearances(d, [text], callback) for d in corpus.documents]
- ],
+ np.c_[[count_appearances(d, [text], callback) for d in source]],
[f"Contains {text}"],
)
def regex(
- corpus: Corpus, expression: str, callback: Callable
+ corpus: Corpus, expression: str, source: str, callback: Callable
) -> Tuple[np.ndarray, List[str]]:
"""
Count occurrences of pattern in `expression`.
"""
pattern = re.compile(expression)
- def number_regex(tokens: List[str]):
+ def regex_matches(text: Union[str, List]):
callback()
- return sum(bool(pattern.match(t)) for t in tokens)
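+        # documents arrive as plain strings, token/ngram sources as lists of strings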
+ if isinstance(text, str):
+ return len(re.findall(pattern, text))
+ else:
+ return sum(len(re.findall(pattern, ngram)) for ngram in text)
- return (
- np.c_[list(map(number_regex, corpus.tokens))],
- [f"Regex {expression}"],
- )
+ source = get_source(corpus, source)
+ return np.c_[list(map(regex_matches, source))], [f"Regex {expression}"]
def pos_tags(
- corpus: Corpus, pos_tags: str, callback: Callable
+ corpus: Corpus, pos_tags: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
Count number of specified pos tags in corpus
"""
+ assert source == Sources.TOKENS
p_tags = [v.strip().lower() for v in pos_tags.split(",")]
def cust_count(tags):
@@ -325,7 +325,7 @@ def cust_count(tags):
def yule(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
    Yule's I measure: a higher number means greater diversity - a richer vocabulary
@@ -333,6 +333,7 @@ def yule(
Mathematical Proceedings of the Cambridge Philosophical Society, 42(2), B1-B2.
doi:10.1017/S0305004100022799
"""
+ assert source == Sources.TOKENS
if corpus.pos_tags is None:
return None
@@ -354,13 +355,13 @@ def yules_i(tags):
def lix(
- corpus: Corpus, _: str, callback: Callable
+ corpus: Corpus, _: str, source: str, callback: Callable
) -> Optional[Tuple[np.ndarray, List[str]]]:
"""
Readability index LIX
https://en.wikipedia.org/wiki/Lix_(readability_test)
"""
- corpus = preprocess_only_words(corpus)
+ assert source == Sources.TOKENS
tokenizer = tokenize.PunktSentenceTokenizer()
def lix_index(document, tokens):
@@ -393,18 +394,21 @@ class ComputeValue:
pattern
Some statistics need additional parameter with the pattern
(e.g. starts with), for others it is set to empty string.
+ source
+ Part of the corpus used for computation: either tokens/ngrams or whole documents
"""
- def __init__(self, function: Callable, pattern: str) -> None:
+ def __init__(self, function: Callable, pattern: str, source: str) -> None:
self.function = function
self.pattern = pattern
+ self.source = source
def __call__(self, data: Corpus) -> np.ndarray:
"""
        Compute values on a new table.
"""
# lambda is added as a placeholder for a callback.
- return self.function(data, self.pattern, lambda: True)[0]
+ return self.function(data, self.pattern, self.source, lambda: True)[0]
def __eq__(self, other):
-        return self.function == other.function and self.pattern == other.pattern
+        return (
+            self.function == other.function
+            and self.pattern == other.pattern
+            and self.source == other.source
+        )
@@ -419,30 +423,32 @@ def __hash__(self):
STATISTICS = [
-    # (name of the statistics, function to compute, default value)
-    # if default value is None - text box is not required
+    # (name of the statistic, function to compute, default value, supported sources)
+    # if the default value is None, the text box is not required
- ("Word count", words_count, None),
- ("Character count", characters_count, None),
- ("N-gram count", n_gram_count, None),
- ("Average word length", average_word_len, None),
- ("Punctuation count", punctuation_count, None),
- ("Capital letter count", capital_count, None),
- ("Vowel count", vowel_count, "a,e,i,o,u"),
+ ("Word count", words_count, None, (Sources.DOCUMENTS,)),
+ ("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
+ ("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
+ ("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
+ ("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
+ ("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
(
"Consonant count",
consonant_count,
"b,c,d,f,g,h,j,k,l,m,n,p,q,r,s,t,v,w,x,y,z",
+ (Sources.DOCUMENTS,),
),
- ("Per cent unique words", per_cent_unique_words, None),
- ("Starts with", starts_with, ""),
- ("Ends with", ends_with, ""),
- ("Contains", contains, ""),
- ("Regex", regex, ""),
- ("POS tag", pos_tags, "NN,VV,JJ"),
- ("Yule's I", yule, None),
- ("LIX index", lix, None),
+ ("Per cent unique terms", per_cent_unique_words, None, (Sources.TOKENS,)),
+ ("Starts with", starts_with, "", (Sources.TOKENS,)),
+ ("Ends with", ends_with, "", (Sources.TOKENS,)),
+ ("Contains", contains, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("Regex", regex, "", (Sources.DOCUMENTS, Sources.TOKENS)),
+ ("POS tag", pos_tags, "NN,VV,JJ", (Sources.TOKENS,)),
+ ("Yule's I", yule, None, (Sources.TOKENS,)),
+ ("LIX index", lix, None, (Sources.TOKENS,)),
]
STATISTICS_NAMES = list(list(zip(*STATISTICS))[0])
STATISTICS_FUNCTIONS = list(list(zip(*STATISTICS))[1])
STATISTICS_DEFAULT_VALUE = list(list(zip(*STATISTICS))[2])
+STATISTICS_DEFAULT_SOURCES = list(list(zip(*STATISTICS))[3])
-def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+def run(corpus: Corpus, statistics: List[Tuple[int, str, str]], state: TaskState) -> None:
@@ -466,12 +472,12 @@ def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
def advance():
state.set_progress_value(next(tick_values))
- for s, patern in statistics:
+    for s, pattern, source in statistics:
fun = STATISTICS_FUNCTIONS[s]
- result = fun(corpus, patern, advance)
+        result = fun(corpus, pattern, source, advance)
if result is not None:
- result = result + (ComputeValue(fun, patern),)
- state.set_partial_result((s, patern, result))
+            result = result + (ComputeValue(fun, pattern, source),)
+            state.set_partial_result((s, pattern, source, result))
class OWStatistics(OWWidget, ConcurrentWidgetMixin):
@@ -491,12 +497,14 @@ class Warning(OWWidget.Warning):
"{} statistics cannot be computed and is omitted from results."
)
want_main_area = False
mainArea_width_height_ratio = None
- # settings
- default_rules = [(0, ""), (1, "")] # rules used to reset the active rules
- active_rules: List[Tuple[int, str]] = Setting(default_rules[:])
+ settings_version = 2
+ # rules used to reset the active rules
+    default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[1][-1][0])]
+ active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
# rules active at time of apply clicked
-    applied_rules: Optional[List[Tuple[int, str]]] = None
+    applied_rules: Optional[List[Tuple[int, str, str]]] = None
@@ -507,12 +515,14 @@ def __init__(self) -> None:
ConcurrentWidgetMixin.__init__(self)
self.corpus = None
- # the list with combos from the widget
- self.combos = []
+        # the list of combos for selecting statistics
+ self.statistics_combos = []
# the list with line edits from the widget
self.line_edits = []
# the list of buttons in front of controls that removes them
self.remove_buttons = []
+        # the list of combos for selecting the source a statistic is computed on
+ self.source_combos = []
self._init_controls()
@@ -542,6 +552,7 @@ def _init_statistics_box(self) -> None:
grid.setColumnStretch(2, 100)
grid.addWidget(QLabel("Feature"), 0, 1)
grid.addWidget(QLabel("Pattern"), 0, 2)
+ grid.addWidget(QLabel("Compute for"), 0, 3)
gui.button(
box,
@@ -562,7 +573,7 @@ def adjust_n_rule_rows(self) -> None:
"""
def _add_line():
- n_lines = len(self.combos) + 1
+ n_lines = len(self.statistics_combos) + 1
# add delete symbol
button = gui.button(
@@ -577,23 +588,29 @@ def _add_line():
combo.addItems(STATISTICS_NAMES)
combo.currentIndexChanged.connect(self._sync_edit_combo)
self.rules_grid.addWidget(combo, n_lines, 1)
- self.combos.append(combo)
+ self.statistics_combos.append(combo)
- # add line edit for patern
+ # add line edit for pattern
line_edit = QLineEdit()
self.rules_grid.addWidget(line_edit, n_lines, 2)
line_edit.textChanged.connect(self._sync_edit_line)
self.line_edits.append(line_edit)
+            # add dropdown for selecting the computation source
+ combo = QComboBox()
+ combo.currentIndexChanged.connect(self._sync_edit_source_combo)
+ self.rules_grid.addWidget(combo, n_lines, 3)
+ self.source_combos.append(combo)
+
def _remove_line():
- self.combos.pop().deleteLater()
+ self.statistics_combos.pop().deleteLater()
self.line_edits.pop().deleteLater()
+ self.source_combos.pop().deleteLater()
self.remove_buttons.pop().deleteLater()
def _fix_tab_order():
- # TODO: write it differently - check create class
- for i, (r, c, l) in enumerate(
- zip(self.active_rules, self.combos, self.line_edits)
+ for i, (r, c, l, s) in enumerate(
+ zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
):
c.setCurrentIndex(r[0]) # update combo
l.setText(r[1]) # update line edit
@@ -601,17 +618,21 @@ def _fix_tab_order():
l.setVisible(True)
else:
l.setVisible(False)
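+            # repopulate the source combo without re-triggering rule updates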
+ with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
+ s.clear()
+ s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
+ s.setCurrentText(r[2])
n = len(self.active_rules)
- while n > len(self.combos):
+ while n > len(self.statistics_combos):
_add_line()
- while len(self.combos) > n:
+ while len(self.statistics_combos) > n:
_remove_line()
_fix_tab_order()
def _add_row(self) -> None:
""" Add a new row to the statistic box """
- self.active_rules.append((0, ""))
+ self.active_rules.append((0, "", STATISTICS_DEFAULT_SOURCES[0][0]))
self.adjust_n_rule_rows()
def _remove_row(self) -> None:
@@ -623,20 +644,27 @@ def _remove_row(self) -> None:
def _sync_edit_combo(self) -> None:
""" Update rules when combo value changed """
combo = self.sender()
- edit_index = self.combos.index(combo)
+ edit_index = self.statistics_combos.index(combo)
selected_i = combo.currentIndex()
- default_value = STATISTICS_DEFAULT_VALUE[selected_i]
- self.active_rules[edit_index] = (selected_i, default_value)
+ default_value = STATISTICS_DEFAULT_VALUE[selected_i] or ""
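+        # reset source to the first one supported by the newly selected statistic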
+ default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0]
+ self.active_rules[edit_index] = (selected_i, default_value, default_source)
self.adjust_n_rule_rows()
def _sync_edit_line(self) -> None:
""" Update rules when line edit value changed """
line_edit = self.sender()
edit_index = self.line_edits.index(line_edit)
- self.active_rules[edit_index] = (
- self.active_rules[edit_index][0],
- line_edit.text(),
- )
+ arules = self.active_rules[edit_index]
+ self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2])
+
+ def _sync_edit_source_combo(self) -> None:
+ """ Update rules when source value change """
+ combo = self.sender()
+ edit_index = self.source_combos.index(combo)
+ value = combo.currentText()
+ arules = self.active_rules[edit_index]
+ self.active_rules[edit_index] = (arules[0], arules[1], value)
@Inputs.corpus
def set_data(self, corpus) -> None:
@@ -666,10 +694,10 @@ def on_exception(self, exception: Exception) -> None:
raise exception
def on_partial_result(
- self, result: Tuple[int, str, Tuple[np.ndarray, List[str], Callable]]
+ self, result: Tuple[int, str, str, Tuple[np.ndarray, List[str], Callable]]
) -> None:
- statistic, patern, result = result
- self.result_dict[(statistic, patern)] = result
+        statistic, pattern, source, result = result
+        self.result_dict[(statistic, pattern, source)] = result
def on_done(self, result: None) -> None:
# join results
@@ -707,6 +735,21 @@ def output_results(self) -> None:
)
self.Outputs.corpus.send(new_corpus)
+ @classmethod
+ def migrate_settings(cls, settings: Dict, version: int):
+ def def_source(idx):
+ """Return source that behaviour is the most similar to previous version"""
+ if STATISTICS_NAMES[idx] == "Regex":
+ # regex was working on tokens in the previous version
+ return Sources.TOKENS
+ # others that allow both sources were working on documents
+ return STATISTICS_DEFAULT_SOURCES[idx][0]
+
+ if version < 2:
+ if "active_rules" in settings:
+ new_rules = [(r, v, def_source(r)) for r, v in settings["active_rules"]]
+ settings["active_rules"] = new_rules
+
if __name__ == "__main__":
WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
index ad1820413..e3082e406 100644
--- a/orangecontrib/text/widgets/tests/test_owstatistics.py
+++ b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -5,11 +5,18 @@
from Orange.data import Domain, StringVariable
from Orange.widgets.tests.base import WidgetTest
+from Orange.widgets.tests.utils import simulate
from orangecontrib.text import Corpus
+from orangecontrib.text.preprocess import (
+ PreprocessorList,
+ LowercaseTransformer,
+ RegexpTokenizer,
+ StopwordsFilter,
+)
from orangecontrib.text.tag import AveragedPerceptronTagger
from orangecontrib.text.widgets.owstatistics import (
STATISTICS_NAMES,
- OWStatistics,
+ OWStatistics, Sources,
)
@@ -40,7 +47,9 @@ def _create_simple_data(self) -> None:
text_features=[text_var],
)
- def _set_feature(self, feature_name: str, value: str = ""):
+ def _set_feature(
+ self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+ ):
"""
        Set the statistic to be computed by the widget. Only one statistic
        is set.
@@ -52,11 +61,15 @@ def _set_feature(self, feature_name: str, value: str = ""):
value
            If the statistic needs a value (e.g. a prefix), it is passed here.
"""
- feature_index = STATISTICS_NAMES.index(feature_name)
- self.widget.active_rules = [(feature_index, value)]
- self.widget.adjust_n_rule_rows()
-
- def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
+ simulate.combobox_activate_item(self.widget.statistics_combos[0], feature_name)
+ self.widget.line_edits[0].setText(value)
+ simulate.combobox_activate_item(self.widget.source_combos[0], source)
+ for button in self.widget.remove_buttons[1:]:
+ button.click()
+
+ def _compute_features(
+ self, feature_name: str, value: str = "", source: str = Sources.DOCUMENTS
+ ) -> Corpus:
"""
        Send `self.corpus` to the widget, set the statistic to be computed,
run the computation, and return widget output.
@@ -74,7 +87,7 @@ def _compute_features(self, feature_name: str, value: str = "") -> Corpus:
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()
- self._set_feature(feature_name, value)
+ self._set_feature(feature_name, value, source)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -101,7 +114,10 @@ def test_words_count(self):
def test_characters_count(self):
""" Test characters count statistic """
- data = self._compute_features("Character count")
+ data = self._compute_features("Character count", source=Sources.DOCUMENTS)
+ np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
+
+ data = self._compute_features("Character count", source=Sources.TOKENS)
np.testing.assert_array_equal(data.X.flatten(), [47, 44, 48, 51])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -109,7 +125,7 @@ def test_characters_count(self):
def test_n_gram_count(self):
""" Test n-grams count statistic """
- data = self._compute_features("N-gram count")
+ data = self._compute_features("N-gram count", source=Sources.TOKENS)
np.testing.assert_array_equal(data.X.flatten(), [10, 12, 13, 12])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -161,16 +177,16 @@ def test_consonants_count(self):
def test_per_cent_unique_words(self):
""" Test per-cent unique words statistic """
- data = self._compute_features("Per cent unique words")
+ data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
np.testing.assert_array_almost_equal(
- data.X.flatten(), [1, 1, 0.909091, 1]
+ data.X.flatten(), [1, 1, 0.84615, 1], decimal=5
)
with self.corpus.unlocked():
- self.corpus[1][-1] = ""
- data = self._compute_features("Per cent unique words")
+ self.corpus[1][-1] = " "
+ data = self._compute_features("Per cent unique terms", source=Sources.TOKENS)
np.testing.assert_array_almost_equal(
- data.X.flatten(), [1, np.nan, 0.909091, 1]
+ data.X.flatten(), [1, np.nan, 0.84615, 1], decimal=5
)
self.send_signal(self.widget.Inputs.corpus, None)
@@ -178,10 +194,10 @@ def test_per_cent_unique_words(self):
def test_starts_with(self):
""" Test starts with count statistic """
- data = self._compute_features("Starts with", "a")
+ data = self._compute_features("Starts with", "a", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 2, 2])
- data = self._compute_features("Starts with", "ap")
+ data = self._compute_features("Starts with", "ap", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -189,10 +205,10 @@ def test_starts_with(self):
def test_ends_with(self):
""" Test ends with count statistic """
- data = self._compute_features("Ends with", "t")
+ data = self._compute_features("Ends with", "t", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [3, 3, 1, 2])
- data = self._compute_features("Ends with", "et")
+ data = self._compute_features("Ends with", "et", Sources.TOKENS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 0, 0])
self.send_signal(self.widget.Inputs.corpus, None)
@@ -200,28 +216,50 @@ def test_ends_with(self):
def test_contains(self):
""" Test contains count statistic """
- data = self._compute_features("Contains", "t")
+ data = self._compute_features("Contains", "t", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
- data = self._compute_features("Contains", "et")
+ data = self._compute_features("Contains", "et", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [2, 1, 0, 0])
- data = self._compute_features("Contains", "is")
+ data = self._compute_features("Contains", "is", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+ data = self._compute_features("Contains", "t", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [5, 4, 4, 9])
+
+ data = self._compute_features("Contains", " ", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
self.send_signal(self.widget.Inputs.corpus, None)
self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
def test_regex(self):
""" Test regex statistic """
- # words that contains digit
- data = self._compute_features("Regex", "\w*\d\w*")
+        # words that contain a digit
+ data = self._compute_features("Regex", r"\w*\d\w*", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
- # words that contains digit
- data = self._compute_features("Regex", "\w*is\w*")
+        # words that contain "is"
+ data = self._compute_features("Regex", r"\w*is\w*", Sources.DOCUMENTS)
np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+ # count specific n-gram
+ data = self._compute_features("Regex", r"ipsum\ dolor", Sources.DOCUMENTS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])
+
+    # words that contain a digit
+ data = self._compute_features("Regex", r"\w*\d\w*", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
+
+    # words that contain "is"
+ data = self._compute_features("Regex", r"\w*is\w*", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
+
+ # count specific n-gram
+ data = self._compute_features("Regex", r"ipsum\ dolor", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
self.send_signal(self.widget.Inputs.corpus, None)
self.assertIsNone(self.get_output(self.widget.Outputs.corpus))
@@ -232,7 +270,7 @@ def test_pos(self):
- test with corpus that has pos tags
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("POS tag", "NN")
+ self._set_feature("POS tag", "NN", Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -243,7 +281,7 @@ def test_pos(self):
result = tagger(self.corpus)
self.send_signal(self.widget.Inputs.corpus, result)
- self._set_feature("POS tag", "NN")
+ self._set_feature("POS tag", "NN", Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -258,7 +296,7 @@ def test_yule(self):
- test with corpus that has pos tags
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("Yule's I")
+ self._set_feature("Yule's I", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -271,7 +309,7 @@ def test_yule(self):
result = tagger(self.corpus)
self.send_signal(self.widget.Inputs.corpus, result)
- self._set_feature("Yule's I")
+ self._set_feature("Yule's I", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -287,7 +325,7 @@ def test_lix(self):
with self.corpus.unlocked():
self.corpus[1][-1] = "simple. simple."
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self._set_feature("LIX index")
+ self._set_feature("LIX index", source=Sources.TOKENS)
self.widget.apply()
self.wait_until_finished()
res = self.get_output(self.widget.Outputs.corpus)
@@ -295,6 +333,40 @@ def test_lix(self):
# the second document will have lower complexity than the first one
self.assertLess(res[1][0], res[0][0])
+ def test_stats_different_preprocessing(self):
+ pp = [LowercaseTransformer(), RegexpTokenizer(), StopwordsFilter(language="en")]
+ pp = PreprocessorList(pp)
+ self.corpus = pp(self.corpus)
+
+ data = self._compute_features("Character count", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [47, 44, 46, 51])
+
+ data = self._compute_features("N-gram count", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [8, 9, 9, 9])
+
+ data = self._compute_features("Per cent unique terms", "", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [1, 1, 1, 1])
+
+    # none start with a capital letter because of the LowercaseTransformer
+ data = self._compute_features("Starts with", "L", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+ data = self._compute_features("Starts with", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 0, 0, 2])
+
+ data = self._compute_features("Ends with", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 1, 2, 1])
+
+    # none contain a comma since we use the RegexpTokenizer preprocessor
+ data = self._compute_features("Contains", ",", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
+ data = self._compute_features("Contains", "a", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [2, 2, 6, 5])
+
+ data = self._compute_features("Regex", "{e", Sources.TOKENS)
+ np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 0])
+
def test_statistics_combination(self):
"""
Testing three statistics at same time and see if column concatenated
@@ -306,9 +378,9 @@ def test_statistics_combination(self):
starts_with_index = STATISTICS_NAMES.index("Starts with")
capital_counts_index = STATISTICS_NAMES.index("Capital letter count")
self.widget.active_rules = [
- (wc_index, ""),
- (starts_with_index, "a"),
- (capital_counts_index, ""),
+ (wc_index, "", Sources.DOCUMENTS),
+ (starts_with_index, "a", Sources.TOKENS),
+ (capital_counts_index, "", Sources.DOCUMENTS),
]
self.widget.adjust_n_rule_rows()
@@ -333,43 +405,44 @@ def test_dictionary_statistics(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.widget.active_rules = [
- (1, ""),
- ]
+ self.widget.active_rules = [(1, "", Sources.DOCUMENTS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual([(1, None)], list(self.widget.result_dict.keys()))
+ expected = [(1, "", Sources.DOCUMENTS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
- self.widget.active_rules = [(1, ""), (2, "")]
+ self.widget.active_rules = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual(
- [(1, ""), (2, None)], list(self.widget.result_dict.keys())
- )
+ expected = [(1, "", Sources.DOCUMENTS), (2, "", Sources.TOKENS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
- self.widget.active_rules = [(2, "")]
+ self.widget.active_rules = [(2, "", Sources.TOKENS)]
self.widget.adjust_n_rule_rows()
self.widget.apply()
self.wait_until_finished()
- self.assertListEqual([(2, None)], list(self.widget.result_dict.keys()))
+ expected = [(2, "", Sources.TOKENS)]
+ self.assertListEqual(expected, list(self.widget.result_dict.keys()))
        # dict should be empty on new data
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertListEqual([], list(self.widget.result_dict.keys()))
def test_settings(self):
- """ Test whether context correctly restore rules """
- rules = [(0, ""), (1, ""), (2, None)]
+ """Test whether context correctly restore rules"""
+ doc, tk = Sources.DOCUMENTS, Sources.TOKENS
+ rules = [(0, "", doc), (1, "", doc), (2, "", tk)]
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.widget.active_rules = rules[:]
self.send_signal(self.widget.Inputs.corpus, self.book_data)
- self.assertListEqual([(0, ""), (1, ""), (2, None)], self.widget.active_rules)
+ expected = [(0, "", doc), (1, "", doc), (2, "", tk)]
+ self.assertListEqual(expected, self.widget.active_rules)
def test_compute_values(self):
""" Test compute values on new data """
@@ -401,13 +474,13 @@ def test_add_row(self):
if x.text() == "+"
][0]
add_button.click()
- self.assertListEqual([(0, "")], self.widget.active_rules)
+ self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
def test_remove_row(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.widget.active_rules = [(0, "")]
+ self.widget.active_rules = [(0, "", Sources.DOCUMENTS)]
self.widget.adjust_n_rule_rows()
- self.assertListEqual([(0, "")], self.widget.active_rules)
+ self.assertListEqual([(0, "", Sources.DOCUMENTS)], self.widget.active_rules)
remove_button = [
x
@@ -417,6 +490,32 @@ def test_remove_row(self):
remove_button.click()
self.assertListEqual([], self.widget.active_rules)
+ def test_migrate_settings(self):
+ vals = [""] * 6 + ["a,e", "b,c", "", "a", "b", "c", r"\w*is", "NN,VV", "", ""]
+ settings = {"__version__": 1, "active_rules": list(zip(range(17), vals))}
+ widget = self.create_widget(OWStatistics, stored_settings=settings)
+ self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=widget)
+
+ expected = [
+ (0, "", Sources.DOCUMENTS),
+ (1, "", Sources.DOCUMENTS),
+ (2, "", Sources.TOKENS),
+ (3, "", Sources.DOCUMENTS),
+ (4, "", Sources.DOCUMENTS),
+ (5, "", Sources.DOCUMENTS),
+ (6, "a,e", Sources.DOCUMENTS),
+ (7, "b,c", Sources.DOCUMENTS),
+ (8, "", Sources.TOKENS),
+ (9, "a", Sources.TOKENS),
+ (10, "b", Sources.TOKENS),
+ (11, "c", Sources.DOCUMENTS),
+ (12, r"\w*is", Sources.DOCUMENTS),
+ (13, "NN,VV", Sources.TOKENS),
+ (14, "", Sources.TOKENS),
+ (15, "", Sources.TOKENS),
+ ]
+ self.assertListEqual(expected, widget.active_rules)
+
if __name__ == "__main__":
unittest.main()