Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Preprocess: Filter by absolute frequency #601

Merged
merged 3 commits into from
Dec 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 126 additions & 35 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from AnyQt.QtCore import Qt, pyqtSignal
from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
QDoubleSpinBox, QFileDialog
QDoubleSpinBox, QFileDialog, QAbstractSpinBox
from AnyQt.QtWidgets import QWidget, QPushButton, QSizePolicy, QStyle
from AnyQt.QtGui import QBrush

Expand Down Expand Up @@ -126,6 +126,9 @@ def set_range(self, start: float, end: float):
self._spin_start.setMaximum(end)
self._spin_end.setMinimum(start)

def spins(self) -> Tuple[QAbstractSpinBox, QAbstractSpinBox]:
return self._spin_start, self._spin_end


class RangeDoubleSpins(RangeSpins):
SpinBox = QDoubleSpinBox
Expand All @@ -140,9 +143,6 @@ def __init__(self, start: float, step: float, end: float, minimum: int,
self._spin_start.setMinimumWidth(0)
self._spin_end.setMinimumWidth(0)

def spins(self) -> Tuple[QDoubleSpinBox, QDoubleSpinBox]:
return self._spin_start, self._spin_end


class FileLoader(QWidget):
activated = pyqtSignal()
Expand Down Expand Up @@ -501,18 +501,21 @@ def __repr__(self):


class FilteringModule(MultipleMethodModule):
Stopwords, Lexicon, Regexp, DocFreq, MostFreq = range(5)
Stopwords, Lexicon, Regexp, DocFreq, DummyDocFreq, MostFreq = range(6)
Methods = {Stopwords: StopwordsFilter,
Lexicon: LexiconFilter,
Regexp: RegexpFilter,
DocFreq: FrequencyFilter,
DummyDocFreq: FrequencyFilter,
MostFreq: MostFrequentTokensFilter}
DEFAULT_METHODS = [Stopwords]
DEFAULT_LANG = "English"
DEFAULT_NONE = None
DEFAULT_PATTERN = "\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
"\’|…|\-|–|—|\$|&|\*|>|<|\/|\[|\]"
DEFAULT_START, DEFAULT_END, MIN, MAX = 0.1, 0.9, 0, 10000
DEFAULT_FREQ_TYPE = 0 # 0 - relative freq, 1 - absolute freq
DEFAULT_REL_START, DEFAULT_REL_END, REL_MIN, REL_MAX = 0.1, 0.9, 0, 1
DEFAULT_ABS_START, DEFAULT_ABS_END, ABS_MIN, ABS_MAX = 1, 10, 0, 10000
DEFAULT_N_TOKEN = 100

def __init__(self, parent=None, **kwargs):
Expand All @@ -521,8 +524,11 @@ def __init__(self, parent=None, **kwargs):
self.__sw_file = self.DEFAULT_NONE
self.__lx_file = self.DEFAULT_NONE
self.__pattern = self.DEFAULT_PATTERN
self.__freq_st = self.DEFAULT_START
self.__freq_en = self.DEFAULT_END
self.__freq_type = self.DEFAULT_FREQ_TYPE
self.__rel_freq_st = self.DEFAULT_REL_START
self.__rel_freq_en = self.DEFAULT_REL_END
self.__abs_freq_st = self.DEFAULT_ABS_START
self.__abs_freq_en = self.DEFAULT_ABS_END
self.__n_token = self.DEFAULT_N_TOKEN
self.__invalidated = False

Expand All @@ -544,11 +550,25 @@ def __init__(self, parent=None, **kwargs):
self.__edit = ValidatedLineEdit(self.__pattern, validator)
self.__edit.editingFinished.connect(self.__edit_finished)

self.__range_spins = RangeDoubleSpins(
self.__freq_st, 0.05, self.__freq_en, self.MIN,
self.MAX, self.__set_freq_start, self.__set_freq_end,
self.__spins_edited
rel_freq_rb = QRadioButton("Relative:")
abs_freq_rb = QRadioButton("Absolute:")
self.__freq_group = group = QButtonGroup(self, exclusive=True)
group.addButton(rel_freq_rb, 0)
group.addButton(abs_freq_rb, 1)
group.buttonClicked.connect(self.__freq_group_clicked)
group.button(self.__freq_type).setChecked(True)

self.__rel_range_spins = RangeDoubleSpins(
self.__rel_freq_st, 0.05, self.__rel_freq_en, self.REL_MIN,
self.REL_MAX, self.__set_rel_freq_start, self.__set_rel_freq_end,
self.__rel_spins_edited
)
self.__abs_range_spins = RangeSpins(
self.__abs_freq_st, 1, self.__abs_freq_en, self.ABS_MIN,
self.ABS_MAX, self.__set_abs_freq_start, self.__set_abs_freq_end,
self.__abs_spins_edited
)

self.__spin_n = QSpinBox(
minimum=1, maximum=10 ** 6, value=self.__n_token)
self.__spin_n.editingFinished.connect(self.__spin_n_edited)
Expand All @@ -564,9 +584,16 @@ def __init__(self, parent=None, **kwargs):
self.layout().addWidget(self.__lx_loader.browse_btn, self.Lexicon, 4)
self.layout().addWidget(self.__lx_loader.load_btn, self.Lexicon, 5)
self.layout().addWidget(self.__edit, self.Regexp, 1, 1, 5)
spins = self.__range_spins.spins()
self.layout().addWidget(spins[0], self.DocFreq, 1)
self.layout().addWidget(spins[1], self.DocFreq, 2)
spins = self.__rel_range_spins.spins()
self.layout().addWidget(rel_freq_rb, self.DocFreq, 1)
self.layout().addWidget(spins[0], self.DocFreq, 2)
self.layout().addWidget(spins[1], self.DocFreq, 3)
spins = self.__abs_range_spins.spins()
self.layout().addWidget(abs_freq_rb, self.DummyDocFreq, 1)
self.layout().addWidget(spins[0], self.DummyDocFreq, 2)
self.layout().addWidget(spins[1], self.DummyDocFreq, 3)
title = self.layout().itemAtPosition(self.DummyDocFreq, 0).widget()
title.hide()
self.layout().addWidget(self.__spin_n, self.MostFreq, 1)
self.layout().setColumnStretch(3, 1)

Expand Down Expand Up @@ -599,8 +626,19 @@ def __edit_finished(self):
if self.Regexp in self.methods:
self.edited.emit()

def __spins_edited(self):
if self.DocFreq in self.methods:
def __freq_group_clicked(self):
i = self.__freq_group.checkedId()
if self.__freq_type != i:
self.__set_freq_type(i)
if self.DocFreq in self.methods:
self.edited.emit()

def __rel_spins_edited(self):
if self.DocFreq in self.methods and self.__freq_type == 0:
self.edited.emit()

def __abs_spins_edited(self):
if self.DocFreq in self.methods and self.__freq_type == 1:
self.edited.emit()

def __spin_n_edited(self):
Expand All @@ -618,8 +656,15 @@ def setParameters(self, params: Dict):
self.__set_lx_path(params.get("lx_path", self.DEFAULT_NONE),
params.get("lx_list", []))
self.__set_pattern(params.get("pattern", self.DEFAULT_PATTERN))
self.__set_freq_range(params.get("start", self.DEFAULT_START),
params.get("end", self.DEFAULT_END))
self.__set_freq_type(params.get("freq_type", self.DEFAULT_FREQ_TYPE))
self.__set_rel_freq_range(
params.get("rel_start", self.DEFAULT_REL_START),
params.get("rel_end", self.DEFAULT_REL_END)
)
self.__set_abs_freq_range(
params.get("abs_start", self.DEFAULT_ABS_START),
params.get("abs_end", self.DEFAULT_ABS_END)
)
self.__set_n_tokens(params.get("n_tokens", self.DEFAULT_N_TOKEN))
self.__invalidated = False

Expand Down Expand Up @@ -649,19 +694,40 @@ def __set_pattern(self, pattern: str):
self.__edit.setText(pattern)
self.changed.emit()

def __set_freq_range(self, start: float, end: float):
self.__set_freq_start(start)
self.__set_freq_end(end)
self.__range_spins.set_range(start, end)
def __set_freq_type(self, freq_type: int):
if self.__freq_type != freq_type:
self.__freq_type = freq_type
self.__freq_group.button(self.__freq_type).setChecked(True)
self.changed.emit()

def __set_freq_start(self, n: float):
if self.__freq_st != n:
self.__freq_st = n
def __set_rel_freq_range(self, start: float, end: float):
self.__set_rel_freq_start(start)
self.__set_rel_freq_end(end)
self.__rel_range_spins.set_range(start, end)

def __set_rel_freq_start(self, n: float):
if self.__rel_freq_st != n:
self.__rel_freq_st = n
self.changed.emit()

def __set_freq_end(self, n: float):
if self.__freq_en != n:
self.__freq_en = n
def __set_rel_freq_end(self, n: float):
if self.__rel_freq_en != n:
self.__rel_freq_en = n
self.changed.emit()

def __set_abs_freq_range(self, start: int, end: int):
self.__set_abs_freq_start(start)
self.__set_abs_freq_end(end)
self.__abs_range_spins.set_range(start, end)

def __set_abs_freq_start(self, n: int):
if self.__abs_freq_st != n:
self.__abs_freq_st = n
self.changed.emit()

def __set_abs_freq_end(self, n: int):
if self.__abs_freq_en != n:
self.__abs_freq_en = n
self.changed.emit()

def __set_n_tokens(self, n: int):
Expand All @@ -678,8 +744,11 @@ def parameters(self) -> Dict:
"lx_path": self.__lx_file,
"lx_list": self.__lx_loader.recent_paths,
"pattern": self.__pattern,
"start": self.__freq_st,
"end": self.__freq_en,
"freq_type": self.__freq_type,
"rel_start": self.__rel_freq_st,
"rel_end": self.__rel_freq_en,
"abs_start": self.__abs_freq_st,
"abs_end": self.__abs_freq_en,
"n_tokens": self.__n_token,
"invalidated": self.__invalidated})
return params
Expand All @@ -703,8 +772,12 @@ def map_none(s):
pattern = params.get("pattern", FilteringModule.DEFAULT_PATTERN)
filters.append(RegexpFilter(pattern=pattern))
if FilteringModule.DocFreq in methods:
st = params.get("start", FilteringModule.DEFAULT_START)
end = params.get("end", FilteringModule.DEFAULT_END)
if params.get("freq_type", FilteringModule.DEFAULT_FREQ_TYPE) == 0:
st = params.get("rel_start", FilteringModule.DEFAULT_REL_START)
end = params.get("rel_end", FilteringModule.DEFAULT_REL_END)
else:
st = params.get("abs_start", FilteringModule.DEFAULT_ABS_START)
end = params.get("abs_end", FilteringModule.DEFAULT_ABS_END)
filters.append(FrequencyFilter(min_df=st, max_df=end))
if FilteringModule.MostFreq in methods:
n = params.get("n_tokens", FilteringModule.DEFAULT_N_TOKEN)
Expand All @@ -722,7 +795,10 @@ def __repr__(self):
elif method == self.Regexp:
append = f"{self.__pattern}"
elif method == self.DocFreq:
append = f"[{self.__freq_st}, {self.__freq_en}]"
if self.__freq_type == 0:
append = f"[{self.__rel_freq_st}, {self.__rel_freq_en}]"
else:
append = f"[{self.__abs_freq_st}, {self.__abs_freq_en}]"
elif method == self.MostFreq:
append = f"{self.__n_token}"
texts.append(f"{self.Methods[method].name} ({append})")
Expand Down Expand Up @@ -834,7 +910,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = []

settings_version = 2
settings_version = 3

class Inputs:
corpus = Input("Corpus", Corpus)
Expand Down Expand Up @@ -1115,6 +1191,21 @@ def str_into_paths(label):

settings["storedsettings"]["preprocessors"] = preprocessors

if version < 3:
    # Version 3 replaced the single "start"/"end" document-frequency range
    # of the filter preprocessor with separate relative ("rel_*") and
    # absolute ("abs_*") ranges plus a "freq_type" selector
    # (0 - relative freq, 1 - absolute freq).
    preprocessors = settings["storedsettings"]["preprocessors"]
    for pp_name, pp_settings in preprocessors:
        if pp_name != "preprocess.filter":
            continue
        # pop() rather than index + del: stored settings that lack an
        # explicit range must not abort the whole migration with KeyError.
        start = pp_settings.pop("start", None)
        end = pp_settings.pop("end", None)
        if start is None or end is None:
            # Nothing to carry over; the new keys fall back to defaults.
            continue
        if end <= 1:
            # "freq_type" is left unset on purpose: readers default to
            # the relative type, which is what this range was.
            pp_settings["rel_start"] = start
            pp_settings["rel_end"] = end
        else:
            # The old range came from a double spin box, the absolute range
            # is now annotated as int and edited with integer steps.
            # NOTE(review): int() truncates fractional thresholds - confirm
            # that is acceptable for previously saved workflows.
            pp_settings["abs_start"] = int(start)
            pp_settings["abs_end"] = int(end)
            # Without this the migrated absolute range would be ignored,
            # because a missing "freq_type" is read as relative.
            pp_settings["freq_type"] = 1


if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
Expand Down
45 changes: 34 additions & 11 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ def test_migrate_settings_filter(self):
{"methods": [0, 2, 4], "language": "Finnish",
"sw_path": None, "sw_list": [],
"lx_path": None, "lx_list": [],
"pattern": "foo", "start": 0.3, "end": 0.5, "n_tokens": 50}
"pattern": "foo", "rel_start": 0.3,
"rel_end": 0.5, "n_tokens": 50}
)]
self.assertEqual(widget.storedsettings["preprocessors"], params)

Expand Down Expand Up @@ -442,8 +443,16 @@ def line_edit(self):
return self.editor._FilteringModule__edit

@property
def spins(self):
return self.editor._FilteringModule__range_spins.spins()
def group_buttons(self):
return self.editor._FilteringModule__freq_group.buttons()

@property
def rel_spins(self):
return self.editor._FilteringModule__rel_range_spins.spins()

@property
def abs_spins(self):
return self.editor._FilteringModule__abs_range_spins.spins()

@property
def spin(self):
Expand All @@ -460,16 +469,23 @@ def test_init(self):
self.assertEqual(self.sw_combo.currentText(), "(none)")
self.assertEqual(self.lx_combo.currentText(), "(none)")
self.assertEqual(self.line_edit.text(), FilteringModule.DEFAULT_PATTERN)
self.assertEqual(self.spins[0].value(), 0.1)
self.assertEqual(self.spins[1].value(), 0.9)
self.assertTrue(self.group_buttons[0].isChecked())
self.assertFalse(self.group_buttons[1].isChecked())
self.assertEqual(self.rel_spins[0].value(), 0.1)
self.assertEqual(self.rel_spins[1].value(), 0.9)
self.assertEqual(self.abs_spins[0].value(), 1)
self.assertEqual(self.abs_spins[1].value(), 10)
self.assertEqual(self.spin.value(), 100)

def test_parameters(self):
params = {"methods": [FilteringModule.Stopwords],
"language": "English", "sw_path": None, "lx_path": None,
"sw_list": [], "lx_list": [],
"pattern": FilteringModule.DEFAULT_PATTERN, "start": 0.1,
"end": 0.9, "n_tokens": 100, "invalidated": False}
"pattern": FilteringModule.DEFAULT_PATTERN,
"freq_type": 0,
"rel_start": 0.1, "rel_end": 0.9,
"abs_start": 1, "abs_end": 10,
"n_tokens": 100, "invalidated": False}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
Expand All @@ -479,8 +495,11 @@ def test_set_parameters(self):
"language": "Finnish",
"sw_path": sw_path, "lx_path": lx_path,
"sw_list": [sw_path], "lx_list": [lx_path],
"pattern": "foo", "start": 0.2, "end": 0.7, "n_tokens": 10,
"invalidated": False}
"pattern": "foo",
"freq_type": 1,
"rel_start": 0.2, "rel_end": 0.7,
"abs_start": 2, "abs_end": 15,
"n_tokens": 10, "invalidated": False}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)

Expand All @@ -495,8 +514,12 @@ def test_set_parameters(self):
self.assertEqual(self.sw_combo.currentText(), "Foo")
self.assertEqual(self.lx_combo.currentText(), "Bar")
self.assertEqual(self.line_edit.text(), "foo")
self.assertEqual(self.spins[0].value(), 0.2)
self.assertEqual(self.spins[1].value(), 0.7)
self.assertFalse(self.group_buttons[0].isChecked())
self.assertTrue(self.group_buttons[1].isChecked())
self.assertEqual(self.rel_spins[0].value(), 0.2)
self.assertEqual(self.rel_spins[1].value(), 0.7)
self.assertEqual(self.abs_spins[0].value(), 2)
self.assertEqual(self.abs_spins[1].value(), 15)
self.assertEqual(self.spin.value(), 10)

def test_createinstance(self):
Expand Down