Skip to content

Commit

Permalink
Corpus: add title attribute selection dropdown
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 11, 2019
1 parent 3f01f69 commit e0500e5
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 2 deletions.
67 changes: 65 additions & 2 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import numpy as np

from Orange.data import Table
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
from Orange.widgets.utils.itemmodels import VariableListModel
from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
from Orange.widgets.data.owselectcolumns import VariablesListItemView
from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
from Orange.widgets.widget import OWWidget, Msg, Input, Output
Expand Down Expand Up @@ -46,6 +47,7 @@ class Outputs:
"andersen.tab",
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")

class Error(OWWidget.Error):
read_file = Msg("Can't read file {} ({})")
Expand Down Expand Up @@ -73,6 +75,15 @@ def __init__(self):
self.info_label = gui.label(ibox, self, "")
self.update_info()

# dropdown to select title variable
self.title_model = DomainModel(
valid_types=(StringVariable,), placeholder="(no title)")
gui.comboBox(
self.controlArea, self, "title_variable",
box="Title variable", model=self.title_model,
callback=self.update_feature_selection
)

# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
Expand Down Expand Up @@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
return

self.update_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
self.Error.corpus_without_text_features()
Expand All @@ -149,6 +161,47 @@ def open_file(self, path=None, data=None):
[f for f in self.corpus.domain.metas
if f.is_string and f not in self.used_attrs_model])

def _setup_title_dropdown(self):
    """
    Fill the title dropdown and pre-select a title variable.

    The dropdown model is populated from the domain of ``self.corpus``. If a
    meta variable already carries a true ``"title"`` attribute, the first
    such variable is selected. Otherwise a heuristic picks the variable:

    1. among variables whose average text length is at most 30, the one
       with the highest number of unique values;
    2. failing that, the variable with the shortest average text length
       below 100;
    3. failing that, nothing is selected (``None``).

    Columns without any values (an empty corpus) are skipped, because
    ``np.vectorize`` cannot be called on size-0 input when ``otypes`` is
    not set and would raise ``ValueError``.
    """
    self.title_model.set_domain(self.corpus.domain)

    # a variable already marked as the title in the dataset takes precedence
    marked = next(
        (var for var in self.corpus.domain.metas
         if var.attributes.get("title", False)),
        None)
    if marked is not None:
        self.title_variable = marked
        return

    v_len = np.vectorize(len)
    most_unique, max_uniqueness = None, 0
    shortest, min_avg_length = None, 100

    for variable in self.title_model:
        if variable is None or not isinstance(variable, Variable):
            # skip the placeholder and separators of the dropdown model
            continue
        column_values = self.corpus.get_column_view(variable)[0]
        if len(column_values) == 0:
            # nothing to measure; also keeps np.vectorize from raising
            continue

        average_text_length = v_len(column_values).mean()
        uniqueness = len(np.unique(column_values))

        # if the variable is short enough to be a title, prefer the one
        # with the highest number of unique values
        if uniqueness > max_uniqueness and average_text_length <= 30:
            most_unique, max_uniqueness = variable, uniqueness
        # otherwise remember the variable with the shortest average text
        # that is shorter than 100 (if all are longer, nothing is kept)
        elif average_text_length < min_avg_length:
            shortest, min_avg_length = variable, average_text_length

    # `shortest` is None when no candidate qualified at all
    self.title_variable = most_unique if most_unique is not None else shortest

def update_info(self):
def describe(corpus):
dom = corpus.domain
Expand Down Expand Up @@ -194,13 +247,23 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()

self._set_title_attribute()
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)

def _set_title_attribute(self):
    """
    Flag the selected title variable in the corpus domain.

    The ``"title"`` attribute is first removed from every variable and meta
    of the domain; it is then set to ``True`` on ``self.title_variable``,
    provided a variable is selected and it belongs to the domain.
    """
    domain = self.corpus.domain

    # clear the flag everywhere so that at most one variable carries it
    for var in domain.variables + domain.metas:
        var.attributes.pop("title", None)

    chosen = self.title_variable
    if not chosen or chosen not in domain:
        return
    domain[chosen].attributes["title"] = True

def send_report(self):
def describe(features):
if len(features):
Expand Down
93 changes: 93 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import numpy as np
from Orange.data import Table, Domain, StringVariable
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text import Corpus
from orangecontrib.text.widgets.owcorpus import OWCorpus


class TestOWCorpus(WidgetTest):
    """Tests for the title variable dropdown of the Corpus widget."""

    def setUp(self):
        self.widget = self.create_widget(OWCorpus)

    @staticmethod
    def _string_table(names, rows):
        """
        Build a table without features and with one string meta per name.

        `rows` is a list of rows, each holding one string per name.
        """
        domain = Domain([], metas=[StringVariable(name) for name in names])
        return Table(domain, np.empty((len(rows), 0)), metas=rows)

    def check_output(self, sel_title):
        """
        Check that the variable named `sel_title` is the only one marked as
        a title in the output corpus. With `sel_title=None` no variable may
        be marked.
        """
        output = self.get_output(self.widget.Outputs.corpus)
        # fail with a clear assertion instead of an AttributeError below
        self.assertIsNotNone(output)
        for attr in output.domain.variables + output.domain.metas:
            is_title = attr.attributes.get("title", False)
            if str(attr) == sel_title:
                # sel_title attribute must be marked as a title
                self.assertTrue(is_title)
            else:
                # others must not be marked as a title
                self.assertFalse(is_title)

    def test_title_combo(self):
        # default corpus dataset
        self.assertEqual(self.widget.corpus.name, "book-excerpts")

        options = self.widget.title_model[:]
        self.assertIn(self.widget.corpus.domain["Text"], options)
        # for this dataset no title variable is selected
        self.assertIsNone(self.widget.title_variable)
        self.check_output(None)

    def test_title_already_in_dataset(self):
        """
        This dataset already has the title attribute, so the title option
        is set to this attribute by default.
        """
        # dataset in which a variable is already marked as the title
        data = Corpus.from_file("election-tweets-2016")
        self.send_signal(self.widget.Inputs.data, data)

        self.assertEqual(data.domain["Content"], self.widget.title_variable)
        self.check_output("Content")

    def test_title_selection_strategy(self):
        """
        Test whether the selection strategy for a title attribute works
        correctly.
        """
        # select the most unique
        data = self._string_table(
            ["a", "b"],
            [["a" * 10, "a" * 10],
             ["a" * 10, "b" * 10],
             ["a" * 10, "c" * 10]]
        )
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(data.domain["b"], self.widget.title_variable)
        self.check_output("b")

        # select the most unique that is also short enough; here attribute
        # a is not suitable since its values are too long for a title, and
        # c is more unique than b
        data = self._string_table(
            ["a", "b", "c"],
            [["a" * 100, "a" * 10, "a" * 10],
             ["b" * 100, "a" * 10, "b" * 10],
             ["c" * 100, "a" * 10, "b" * 10]]
        )
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(data.domain["c"], self.widget.title_variable)
        self.check_output("c")

        # when no variable is short enough we just select the shortest
        # attribute
        data = self._string_table(
            ["a", "b", "c"],
            [["a" * 100, "a" * 40, "a" * 40],
             ["b" * 100, "a" * 40, "b" * 30],
             ["c" * 100, "a" * 40, "b" * 40]]
        )
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(data.domain["c"], self.widget.title_variable)
        self.check_output("c")

0 comments on commit e0500e5

Please sign in to comment.