Merge pull request #501 from PrimozGodec/word-cloud-no-zeros

Word Cloud: Remove words with zero weights from word cloud
biolab · Feb 24, 2020 · ecb7162 · ecb7162
2 parents 9180817 + eeabcab
commit ecb7162
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 31 deletions.
diff --git a/orangecontrib/text/widgets/owwordcloud.py b/orangecontrib/text/widgets/owwordcloud.py
@@ -26,6 +26,35 @@
 N_BEST_PLOTTED = 200
 
 
+class TableModel(PyTableModel):
+    def __init__(self, precision, **kwargs):
+        super().__init__(**kwargs)
+        self.precision = precision
+
+    def data(self, index, role=Qt.DisplayRole):
+        """
+        Format numbers of the first column with the number of decimal
+        spaces defined by self.predictions which can be changed based on
+        weights type - row counts does not have decimal spaces
+        """
+        row, column = self.mapToSourceRows(index.row()), index.column()
+        if role == Qt.DisplayRole and column == 0:
+            value = float(self[row][column])
+            return f"{value:.{self.precision}f}"
+        return super().data(index, role)
+
+    def set_precision(self, precision: int):
+        """
+        Setter for precision.
+
+        Parameters
+        ----------
+        precision
+            Number of decimal spaces to format the weights.
+        """
+        self.precision = precision
+
+
 class OWWordCloud(widget.OWWidget):
     name = "Word Cloud"
     priority = 510
@@ -175,7 +204,7 @@ def update_selection(self, words):
                 self.__nope = False
 
         view = self.tableview = TableView(self)
-        model = self.tablemodel = PyTableModel(parent=self)
+        model = self.tablemodel = TableModel(2, parent=self)
         proxymodel = QSortFilterProxyModel(
             self,
             dynamicSortFilter=True,
@@ -255,10 +284,20 @@ def _repopulate_wordcloud(
         weights
             Words' weights
         """
+        def is_whole(d):
+            """Whether or not d is a whole number."""
+            return (
+                    isinstance(d, int)
+                    or (isinstance(d, float) and d.is_integer())
+            )
+
         words, weights = words[:N_BEST_PLOTTED], weights[:N_BEST_PLOTTED]
         self.shown_words, self.shown_weights = words, weights
 
         # Repopulate table
+        self.tablemodel.set_precision(
+            0 if all(is_whole(w) for w in weights) else 2
+        )
         self.tablemodel.wrap(list(zip(weights, words)))
         self.tableview.sortByColumn(0, Qt.DescendingOrder)
 
@@ -348,25 +387,13 @@ def _bow_words(self):
         This function extract words from bag of words features and assign them
         the frequency which is average bow count.
         """
-        bow_features = self._get_bow_variables()
-        if not bow_features:
-            return {}
-
         average_bows = {
-            f.name: self.corpus.get_column_view(f)[0].mean()
-            for f in bow_features
+            f.name: self.corpus.X[:, i].mean()
+            for i, f in enumerate(self.corpus.domain.attributes)
+            if f.attributes.get("bow-feature", False)
         }
-        return average_bows
-
-    def _get_bow_variables(self):
-        """
-        Extract bow variables from data
-        """
-        return [
-            var
-            for var in self.corpus.domain.variables
-            if var.attributes.get("bow-feature", False)
-        ]
+        # return only positive bow weights (those == 0 are non-existing words)
+        return {f: w for f, w in average_bows.items() if w > 0}
 
     def handleNewSignals(self):
         if self.topic is not None and len(self.topic):

diff --git a/orangecontrib/text/widgets/tests/test_owworldcloud.py b/orangecontrib/text/widgets/tests/test_owworldcloud.py
@@ -62,15 +62,20 @@ def test_bow_features(self):
             v.attributes["bow-feature"] = True
 
         self.send_signal(self.widget.Inputs.corpus, data)
-        self.assertDictEqual(
-            self.widget.corpus_counter, {"Word1": 1, "Word2": 2, "Word3": 2})
+        weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
+        # due to computation error in computing mean use array_almost_equal
+        np.testing.assert_array_almost_equal(weights, [1, 2, 2])
+
         output = self.get_output(self.widget.Outputs.word_counts)
-        np.testing.assert_array_equal([2, 2, 1], output.X.flatten())
+        np.testing.assert_array_almost_equal([2, 2, 1], output.X.flatten())
         np.testing.assert_array_equal(
-            ["Word2", "Word3", "Word1"], output.metas.flatten())
-        self.assertListEqual(
-            [(2.0, 'Word2'), (2.0, 'Word3'), (1.0, 'Word1')],
-            self.widget.tablemodel[:])
+            ["Word3", "Word2", "Word1"], output.metas.flatten())
+        self.assertTupleEqual(
+            ("Word3", "Word2", "Word1"),
+            list(zip(*self.widget.tablemodel[:]))[1])
+        np.testing.assert_array_almost_equal(
+            [2, 2, 1],
+            list(zip(*self.widget.tablemodel[:]))[0])
 
         # try with one word not bow-feature
         data = self.corpus[:3]
@@ -81,15 +86,19 @@ def test_bow_features(self):
             v.attributes["bow-feature"] = True
 
         self.send_signal(self.widget.Inputs.corpus, data)
-        self.assertDictEqual(
-            self.widget.corpus_counter, {"Word1": 1, "Word2": 2})
+        weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
+        np.testing.assert_array_almost_equal(weights, [1, 2])
+
         output = self.get_output(self.widget.Outputs.word_counts)
-        np.testing.assert_array_equal([2, 1], output.X.flatten())
+        np.testing.assert_array_almost_equal([2, 1], output.X.flatten())
         np.testing.assert_array_equal(
             ["Word2", "Word1"], output.metas.flatten())
-        self.assertListEqual(
-            [(2.0, 'Word2'), (1.0, 'Word1')],
-            self.widget.tablemodel[:])
+        self.assertTupleEqual(
+            ("Word2", "Word1"),
+            list(zip(*self.widget.tablemodel[:]))[1])
+        np.testing.assert_array_almost_equal(
+            [2, 1],
+            list(zip(*self.widget.tablemodel[:]))[0])
 
     def test_bow_info(self):
         """