Skip to content

Commit

Permalink
Corpus: add title attribute selection dropdown
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 16, 2019
1 parent 3f01f69 commit ae52672
Show file tree
Hide file tree
Showing 4 changed files with 240 additions and 18 deletions.
10 changes: 1 addition & 9 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,7 @@ def titles(self):
""" Returns a list of titles. """
attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
if attr.attributes.get('title', False)]
# Alternatively, use heuristics
if not attrs:
for var in sorted(chain(self.domain.metas, self.domain.variables),
key=lambda var: var.name,
reverse=True): # reverse so that title < heading < filename
if var.name.lower() in ('title', 'heading', 'h1', 'filename') \
and not var.attributes.get('hidden', False): # skip BoW features
attrs = [var]
break

if attrs:
return self.documents_from_features(attrs)
else:
Expand Down
7 changes: 0 additions & 7 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,13 +163,6 @@ def test_titles(self):
for title in titles:
self.assertIn('Document ', title)

# inferred title from heuristics
expected = list(map(str, range(len(c))))
c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
None, None, np.c_[expected])
titles = c2.titles
self.assertEqual(titles, expected)

# title feature set
c.domain[0].attributes['title'] = True
titles = c.titles
Expand Down
76 changes: 74 additions & 2 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import numpy as np

from Orange.data import Table
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
from Orange.widgets.utils.itemmodels import VariableListModel
from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
from Orange.widgets.data.owselectcolumns import VariablesListItemView
from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
from Orange.widgets.widget import OWWidget, Msg, Input, Output
Expand Down Expand Up @@ -46,6 +47,7 @@ class Outputs:
"andersen.tab",
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")

class Error(OWWidget.Error):
read_file = Msg("Can't read file {} ({})")
Expand Down Expand Up @@ -73,6 +75,15 @@ def __init__(self):
self.info_label = gui.label(ibox, self, "")
self.update_info()

# dropdown to select title variable
self.title_model = DomainModel(
valid_types=(StringVariable,), placeholder="(no title)")
gui.comboBox(
self.controlArea, self, "title_variable",
box="Title variable", model=self.title_model,
callback=self.update_feature_selection
)

# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
Expand Down Expand Up @@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
return

self.update_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
self.Error.corpus_without_text_features()
Expand All @@ -149,6 +161,56 @@ def open_file(self, path=None, data=None):
[f for f in self.corpus.domain.metas
if f.is_string and f not in self.used_attrs_model])

def _setup_title_dropdown(self):
    """
    Populate the title combo box from the corpus domain and preselect a
    title variable.

    The selection is made in this order:

    1. a meta variable that already carries a truthy ``title`` attribute;
    2. a variable named ``title``, ``heading`` or ``filename`` (compared
       case-insensitively, in that priority);
    3. among variables whose average text length is at most 30, the one
       with the most unique values;
    4. otherwise the variable with the shortest average text length,
       provided that length is below 100;
    5. otherwise ``None`` (no title).
    """
    self.title_model.set_domain(self.corpus.domain)

    # if a title variable is already marked in the dataset, use it
    marked = [var for var in self.corpus.domain.metas
              if var.attributes.get("title", False)]
    if marked:
        self.title_variable = marked[0]
        return

    # the model may also contain non-variable entries (e.g. the
    # placeholder), so keep real variables only
    variables = sorted(
        (v for v in self.title_model
         if v is not None and isinstance(v, Variable)),
        key=lambda var: var.name, reverse=True)

    # Well-known names win over any statistics. The priority is spelled
    # out explicitly instead of relying on the sort order, because a
    # case-sensitive sort ranks e.g. "heading" above "Title".
    for wanted in ('title', 'heading', 'filename'):
        for variable in variables:
            if variable.name.lower() == wanted:
                self.title_variable = variable
                return

    # otypes is given explicitly: without it np.vectorize cannot infer the
    # output type from an empty column and raises
    v_len = np.vectorize(len, otypes=[int])
    first_selection = (None, 0)  # variable, uniqueness
    second_selection = (None, 100)  # variable, avg text length

    for variable in variables:
        column_values = self.corpus.get_column_view(variable)[0]
        if len(column_values) == 0:
            # no documents: nothing to measure for this column
            continue
        average_text_length = v_len(column_values).mean()
        uniqueness = len(np.unique(column_values))

        # if the variable is short enough to be a title, prefer the one
        # with the highest number of unique values
        if uniqueness > first_selection[1] and average_text_length <= 30:
            first_selection = (variable, uniqueness)
        # else remember the variable with the shortest average text that
        # is shorter than 100 (if all are longer, nothing is remembered)
        elif average_text_length < second_selection[1]:
            second_selection = (variable, average_text_length)

    if first_selection[0] is not None:
        self.title_variable = first_selection[0]
    elif second_selection[0] is not None:
        self.title_variable = second_selection[0]
    else:
        self.title_variable = None

def update_info(self):
def describe(corpus):
dom = corpus.domain
Expand Down Expand Up @@ -194,13 +256,23 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()

self._set_title_attribute()
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)

def _set_title_attribute(self):
# remove all title attributes
for a in self.corpus.domain.variables + self.corpus.domain.metas:
a.attributes.pop("title", None)

if self.title_variable and self.title_variable in self.corpus.domain:
self.corpus.domain[
self.title_variable].attributes["title"] = True

def send_report(self):
def describe(features):
if len(features):
Expand Down
165 changes: 165 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import numpy as np
from Orange.data import Table, Domain, StringVariable
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text import Corpus
from orangecontrib.text.widgets.owcorpus import OWCorpus


class TestOWCorpus(WidgetTest):
    def setUp(self):
        self.widget = self.create_widget(OWCorpus)

    def check_output(self, sel_title):
        """
        Assert that in the widget's output only the variable whose string
        form equals `sel_title` has a truthy "title" attribute.
        """
        domain = self.get_output(self.widget.Outputs.corpus).domain
        for var in domain.variables + domain.metas:
            marked = var.attributes.get("title", False)
            if str(var) == sel_title:
                # the selected variable must be marked as a title
                self.assertTrue(marked)
            else:
                # every other variable must not be marked
                self.assertFalse(marked)

    @staticmethod
    def _long_rows():
        """Return fresh rows: one 100-char column and two 30-40 char ones."""
        return [["a" * 100, "a" * 40, "a" * 40],
                ["b" * 100, "a" * 40, "b" * 30],
                ["c" * 100, "a" * 40, "b" * 40]]

    def _check_selected(self, names, rows, expected):
        """
        Send a table with string metas called `names` and contents `rows`
        to the widget, then assert that `expected` is the selected title
        variable and is the one marked in the output.
        """
        metas = [StringVariable(name) for name in names]
        data = Table(Domain([], metas=metas),
                     np.empty((len(rows), 0)),
                     metas=rows)
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(data.domain[expected], self.widget.title_variable)
        self.check_output(expected)

    def test_title_combo(self):
        widget = self.widget
        # the widget loads the default corpus on creation
        self.assertEqual(widget.corpus.name, "book-excerpts")

        options = widget.title_model[:]
        self.assertIn(widget.corpus.domain["Text"], options)
        # no title variable is selected for this dataset
        self.assertEqual(None, widget.title_variable)
        self.check_output(None)

    def test_title_already_in_dataset(self):
        """
        A dataset that already has a variable marked as title gets that
        variable preselected in the title option.
        """
        data = Corpus.from_file("election-tweets-2016")
        self.send_signal(self.widget.Inputs.data, data)

        self.assertEqual(data.domain["Content"], self.widget.title_variable)
        self.check_output("Content")

    def test_title_selection_strategy_title_heading(self):
        """
        When there is a title, heading, or filename variable, it is selected
        as the default title; title is preferred over heading, and heading
        over filename.
        """
        cases = (
            (("title", "b", "c"), "title"),
            (("Title", "b", "c"), "Title"),
            # when title and heading are both present, title is selected
            (("Title", "Heading", "c"), "Title"),
            (("Heading", "Title", "c"), "Title"),
            (("Heading", "Filename", "c"), "Heading"),
        )
        for names, expected in cases:
            self._check_selected(names, self._long_rows(), expected)

    def test_title_selection_strategy(self):
        """
        Check the selection strategy used when no variable has a
        well-known title name.
        """
        # the variable with the most unique values is selected
        self._check_selected(
            ("a", "b"),
            [["a" * 10, "a" * 10],
             ["a" * 10, "b" * 10],
             ["a" * 10, "c" * 10]],
            "b")

        # the most unique variable that is also short enough is selected:
        # "a" is too long to be a title and "c" is more unique than "b"
        self._check_selected(
            ("a", "b", "c"),
            [["a" * 100, "a" * 10, "a" * 10],
             ["b" * 100, "a" * 10, "b" * 10],
             ["c" * 100, "a" * 10, "b" * 10]],
            "c")

        # when no variable is short enough, the one with the shortest
        # texts is selected
        self._check_selected(("a", "b", "c"), self._long_rows(), "c")

0 comments on commit ae52672

Please sign in to comment.