Skip to content

Commit

Permalink
Corpus: move dataset info in the status bar
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 19, 2019
1 parent 17cb470 commit 20fbf04
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 17 deletions.
38 changes: 22 additions & 16 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,6 @@ def __init__(self):
)
fbox.layout().addWidget(self.file_widget)

# Corpus info
ibox = gui.widgetBox(self.controlArea, "Corpus info", addSpace=True)
self.info_label = gui.label(ibox, self, "")
self.update_info()

# dropdown to select title variable
self.title_model = DomainModel(
valid_types=(StringVariable,), placeholder="(no title)")
Expand Down Expand Up @@ -115,6 +110,8 @@ def __init__(self):

# load first file
self.file_widget.select(0)
self.update_output_info()
self.update_input_info(None)

def sizeHint(self):
return QSize(400, 300)
Expand All @@ -127,6 +124,8 @@ def set_data(self, data):
self.file_widget.setEnabled(not have_data)
self.browse_documentation.setEnabled(not have_data)

self.update_input_info(data)

if have_data:
self.open_file(data=data)
else:
Expand All @@ -148,7 +147,7 @@ def open_file(self, path=None, data=None):
else:
return

self.update_info()
self.update_output_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
Expand Down Expand Up @@ -211,31 +210,38 @@ def _setup_title_dropdown(self):
else:
self.title_variable = None

def update_info(self):
def update_output_info(self):
def describe(corpus):
dom = corpus.domain
text_feats = sum(m.is_string for m in dom.metas)
other_feats = len(dom.attributes) + len(dom.metas) - text_feats
text = \
"{} document(s), {} text features(s), {} other feature(s).". \
"{} document(s)\n{} text features(s)\n{} other feature(s)". \
format(len(corpus), text_feats, other_feats)
if dom.has_continuous_class:
text += "<br/>Regression; numerical class."
text += "\nRegression; numerical class."
elif dom.has_discrete_class:
text += "<br/>Classification; discrete class with {} values.". \
text += "\nClassification; discrete class with {} values.". \
format(len(dom.class_var.values))
elif corpus.domain.class_vars:
text += "<br/>Multi-target; {} target variables.".format(
text += "\nMulti-target; {} target variables.".format(
len(corpus.domain.class_vars))
else:
text += "<br/>Data has no target variable."
text += "</p>"
return text

if self.corpus is None:
self.info_label.setText("No corpus loaded.")
self.info.set_output_summary(self.info.NoOutput)
else:
self.info.set_output_summary(
str(len(self.corpus)), describe(self.corpus))

def update_input_info(self, data):
if data:
self.info.set_input_summary(
str(len(data)),
f"{len(data)} data instance{'s' if len(data) > 1 else ''}"
f" on input")
else:
self.info_label.setText(describe(self.corpus))
self.info.set_input_summary(self.info.NoInput)

def update_feature_selection(self):
self.Error.no_text_features_used.clear()
Expand Down
69 changes: 68 additions & 1 deletion orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from unittest.mock import Mock, patch

import numpy as np
from Orange.data import Table, Domain, StringVariable
from Orange.data import Table, Domain, StringVariable, ContinuousVariable
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text import Corpus
Expand Down Expand Up @@ -163,3 +165,68 @@ def test_title_selection_strategy(self):
self.assertEqual(data.domain["c"], self.widget.title_variable)
self.check_output("c")

def test_input_status(self):
    """
    Check the input summary reported by the widget.

    ``info.set_input_summary`` is replaced with a mock, and the arguments
    of its most recent call are verified after sending a corpus with many
    documents, a corpus with a single document, and ``None``.
    """
    data = Corpus.from_file("election-tweets-2016")
    # replace the summary setter with a mock so its calls can be inspected
    input_sum = self.widget.info.set_input_summary = Mock()

    # several instances on input: the plural "instances" is expected
    self.send_signal(self.widget.Inputs.data, data)
    input_sum.assert_called_with(
        str(len(data)), f"{len(data)} data instances on input")
    input_sum.reset_mock()
    # exactly one instance on input: the singular "instance" is expected
    self.send_signal(self.widget.Inputs.data, data[:1])
    input_sum.assert_called_with("1", "1 data instance on input")
    input_sum.reset_mock()

    # no data on input: the summary is expected to be set to NoInput
    self.send_signal(self.widget.Inputs.data, None)
    input_sum.assert_called_with(self.widget.info.NoInput)
    input_sum.reset_mock()

def test_output_status(self):
    """
    Check the output summary reported by the widget.

    ``info.set_output_summary`` is replaced with a mock, and the arguments
    of its most recent call are verified for a corpus with a discrete
    class, a corpus without a class, a corpus with a continuous class,
    and after the input is removed.
    """
    # corpus received on the input signal (discrete class expected)
    data = Corpus.from_file("election-tweets-2016")
    # replace the summary setter with a mock so its calls can be inspected
    out_sum = self.widget.info.set_output_summary = Mock()

    self.send_signal(self.widget.Inputs.data, data)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Classification; discrete class with 2 values.")
    out_sum.reset_mock()

    # corpus without class: same attributes and metas, no class variable;
    # the expected summary then has no line describing the target
    data1 = Corpus(Domain(data.domain.attributes, metas=data.domain.metas),
                   data.X, metas=data.metas,
                   text_features=data.text_features)
    self.send_signal(self.widget.Inputs.data, data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)")
    out_sum.reset_mock()

    # corpus with continuous class: random values serve as the target,
    # only the kind of class variable matters for the summary text
    data1 = Corpus(Domain(data.domain.attributes,
                          ContinuousVariable("a"),
                          metas=data.domain.metas),
                   data.X, np.random.rand(len(data), 1),
                   metas=data.metas,
                   text_features=data.text_features)
    self.send_signal(self.widget.Inputs.data, data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Regression; numerical class.")
    out_sum.reset_mock()

    # input removed: the default dataset is on the output again
    # (140 documents are expected by the assertion below)
    self.send_signal(self.widget.Inputs.data, None)
    out_sum.assert_called_with(
        "140",
        "140 document(s)\n1 text features(s)\n0 other feature(s)\n"
        "Classification; discrete class with 2 values.")
    out_sum.reset_mock()

0 comments on commit 20fbf04

Please sign in to comment.