Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word Cloud: add document count to Word Count output #1050

Merged: 2 commits merged on Jul 9, 2024.
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions orangecontrib/text/widgets/owwordcloud.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
from collections import Counter
from collections import Counter, defaultdict
from itertools import cycle
from math import pi as PI
from typing import Dict, List, Tuple
Expand Down Expand Up @@ -42,7 +42,8 @@ def _bow_words(corpus):
return {f: w for f, w in average_bows.items() if w > 0}


def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, Counter,
bool]:
"""
This function implements counting process of the word cloud widget and
is called in the separate thread by concurrent.
Expand All @@ -69,8 +70,13 @@ def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
corpus_counter = Counter(
w for doc in data.ngrams for w in doc
)
word_documents = defaultdict(int)
for doc in data.ngrams:
words = set(w for w in doc)
for word in words:
word_documents[word] += 1
state.set_progress_value(1)
return corpus_counter, bool(bow_counts)
return corpus_counter, Counter(word_documents), bool(bow_counts)


class TableModel(PyTableModel):
Expand Down Expand Up @@ -401,36 +407,38 @@ def word_frequencies(self):
words, freq = zip(*counts) if counts else ([], [])
return words, freq

def create_weight_list(self):
def create_weight_list(self, document_counter):
wc_table = None
if self.corpus is not None:
words, freq = self.word_frequencies()
doc_freq = [document_counter[w] for w in words]
words = np.array(words)[:, None]
w_count = np.array(freq)[:, None]
d_count = np.array(doc_freq)[:, None]
domain = Domain(
[ContinuousVariable("Word Count")],
[ContinuousVariable("Word Count"), ContinuousVariable(
"Document Count")],
metas=[StringVariable("Word")],
)
wc_table = Table.from_numpy(domain, X=w_count, metas=words)
wc_table = Table.from_numpy(domain, X=np.hstack((w_count, d_count)),
metas=words)
wc_table.name = "Word Counts"
self.Outputs.word_counts.send(wc_table)

@Inputs.corpus
def on_corpus_change(self, data):
self.corpus = data
self.Info.clear()

self.corpus_counter = Counter()
if data is not None:
self.start(count_words, data)
else:
self.handle_input()
self.create_weight_list()

def on_done(self, result: Tuple[Counter, bool]) -> None:
def on_done(self, result: Tuple[Counter, Counter, bool]) -> None:
self.corpus_counter = result[0]
self.create_weight_list()
if result[1]:
self.create_weight_list(result[1])
if result[2]:
self.Info.bow_weights()
self.handle_input()

Expand Down
13 changes: 11 additions & 2 deletions orangecontrib/text/widgets/tests/test_owwordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_bow_features(self):
np.testing.assert_array_almost_equal(weights, [1, 2, 2])

output = self.get_output(self.widget.Outputs.word_counts)
np.testing.assert_array_almost_equal([2, 2, 1], output.X.flatten())
np.testing.assert_array_almost_equal([2, 2, 1], output.X[:,
0].flatten())
np.testing.assert_array_equal(
["Word3", "Word2", "Word1"], output.metas.flatten())
self.assertTupleEqual(
Expand All @@ -93,7 +94,7 @@ def test_bow_features(self):
np.testing.assert_array_almost_equal(weights, [1, 2])

output = self.get_output(self.widget.Outputs.word_counts)
np.testing.assert_array_almost_equal([2, 1], output.X.flatten())
np.testing.assert_array_almost_equal([2, 1], output.X[:, 0].flatten())
np.testing.assert_array_equal(
["Word2", "Word1"], output.metas.flatten())
self.assertTupleEqual(
Expand Down Expand Up @@ -168,6 +169,14 @@ def test_select_words_output(self):
self.assertEqual(2, len(output))
self.assertEqual("words", output.domain["Words"].attributes["type"])

def test_word_counts_output(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
output = self.get_output(self.widget.Outputs.word_counts)
words = set(w for doc in list(self.corpus.ngrams) for w in doc)
self.assertEqual(len(words), len(output))
self.assertLessEqual(output.X[:, 1][0], len(self.corpus))
self.assertEqual(len(output.domain), 3)


if __name__ == "__main__":
unittest.main()
Loading