Skip to content

Commit

Permalink
Merge pull request #1050 from ajdapretnar/wc-contains
Browse files (browse the repository at this point in the history)
Word Cloud: add document count to Word Count output
  • Loading branch information
ajdapretnar authored Jul 9, 2024
2 parents 101f30e + 0f3d48d commit d648299
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 13 deletions.
30 changes: 19 additions & 11 deletions orangecontrib/text/widgets/owwordcloud.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
from collections import Counter
from collections import Counter, defaultdict
from itertools import cycle
from math import pi as PI
from typing import Dict, List, Tuple
Expand Down Expand Up @@ -42,7 +42,8 @@ def _bow_words(corpus):
return {f: w for f, w in average_bows.items() if w > 0}


def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, Counter,
bool]:
"""
This function implements counting process of the word cloud widget and
is called in the separate thread by concurrent.
Expand All @@ -69,8 +70,13 @@ def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
corpus_counter = Counter(
w for doc in data.ngrams for w in doc
)
word_documents = defaultdict(int)
for doc in data.ngrams:
words = set(w for w in doc)
for word in words:
word_documents[word] += 1
state.set_progress_value(1)
return corpus_counter, bool(bow_counts)
return corpus_counter, Counter(word_documents), bool(bow_counts)


class TableModel(PyTableModel):
Expand Down Expand Up @@ -401,36 +407,38 @@ def word_frequencies(self):
words, freq = zip(*counts) if counts else ([], [])
return words, freq

def create_weight_list(self):
def create_weight_list(self, document_counter):
wc_table = None
if self.corpus is not None:
words, freq = self.word_frequencies()
doc_freq = [document_counter[w] for w in words]
words = np.array(words)[:, None]
w_count = np.array(freq)[:, None]
d_count = np.array(doc_freq)[:, None]
domain = Domain(
[ContinuousVariable("Word Count")],
[ContinuousVariable("Word Count"), ContinuousVariable(
"Document Count")],
metas=[StringVariable("Word")],
)
wc_table = Table.from_numpy(domain, X=w_count, metas=words)
wc_table = Table.from_numpy(domain, X=np.hstack((w_count, d_count)),
metas=words)
wc_table.name = "Word Counts"
self.Outputs.word_counts.send(wc_table)

@Inputs.corpus
def on_corpus_change(self, data):
self.corpus = data
self.Info.clear()

self.corpus_counter = Counter()
if data is not None:
self.start(count_words, data)
else:
self.handle_input()
self.create_weight_list()

def on_done(self, result: Tuple[Counter, bool]) -> None:
def on_done(self, result: Tuple[Counter, Counter, bool]) -> None:
self.corpus_counter = result[0]
self.create_weight_list()
if result[1]:
self.create_weight_list(result[1])
if result[2]:
self.Info.bow_weights()
self.handle_input()

Expand Down
13 changes: 11 additions & 2 deletions orangecontrib/text/widgets/tests/test_owwordcloud.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_bow_features(self):
np.testing.assert_array_almost_equal(weights, [1, 2, 2])

output = self.get_output(self.widget.Outputs.word_counts)
np.testing.assert_array_almost_equal([2, 2, 1], output.X.flatten())
np.testing.assert_array_almost_equal([2, 2, 1], output.X[:,
0].flatten())
np.testing.assert_array_equal(
["Word3", "Word2", "Word1"], output.metas.flatten())
self.assertTupleEqual(
Expand All @@ -93,7 +94,7 @@ def test_bow_features(self):
np.testing.assert_array_almost_equal(weights, [1, 2])

output = self.get_output(self.widget.Outputs.word_counts)
np.testing.assert_array_almost_equal([2, 1], output.X.flatten())
np.testing.assert_array_almost_equal([2, 1], output.X[:, 0].flatten())
np.testing.assert_array_equal(
["Word2", "Word1"], output.metas.flatten())
self.assertTupleEqual(
Expand Down Expand Up @@ -168,6 +169,14 @@ def test_select_words_output(self):
self.assertEqual(2, len(output))
self.assertEqual("words", output.domain["Words"].attributes["type"])

def test_word_counts_output(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
output = self.get_output(self.widget.Outputs.word_counts)
words = set(w for doc in list(self.corpus.ngrams) for w in doc)
self.assertEqual(len(words), len(output))
self.assertLessEqual(output.X[:, 1][0], len(self.corpus))
self.assertEqual(len(output.domain), 3)


if __name__ == "__main__":
unittest.main()

0 comments on commit d648299

Please sign in to comment.