diff --git a/.travis.yml b/.travis.yml index 42d238e14..5328adea0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,9 +26,9 @@ matrix: - &master python: '3.6' env: ORANGE="master" - - &orange3-21-0 + - &orange3-25-0 python: '3.7' - env: ORANGE="3.21.0" + env: ORANGE="3.25.0" env: global: diff --git a/doc/index.rst b/doc/index.rst index 4a4f31b9d..5a2969943 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -16,6 +16,7 @@ Widgets widgets/wikipedia-widget widgets/preprocesstext widgets/bagofwords-widget + widgets/documentembedding widgets/similarityhashing widgets/sentimentanalysis widgets/tweetprofiler diff --git a/doc/widgets.json b/doc/widgets.json index dd0fa18cf..8f4e9e255 100644 --- a/doc/widgets.json +++ b/doc/widgets.json @@ -146,6 +146,13 @@ "icon": "../orangecontrib/text/widgets/icons/Duplicates.svg", "background": "light-blue", "keywords": [] + }, + { + "text": "Document Embedding", + "doc": "widgets/documentembedding.md", + "icon": "../orangecontrib/text/widgets/icons/TextEmbedding.svg", + "background": "light-blue", + "keywords": [] } ] ] diff --git a/doc/widgets/documentembedding.md b/doc/widgets/documentembedding.md new file mode 100644 index 000000000..c92b0e784 --- /dev/null +++ b/doc/widgets/documentembedding.md @@ -0,0 +1,50 @@ +Document Embedding +================== + +Embeds documents from input corpus into vector space by using pretrained +[fastText](https://fasttext.cc/docs/en/crawl-vectors.html) models described in +E. Grave, P. Bojanowski, P. Gupta, A. Joulin, T. Mikolov, +Learning Word Vectors for 157 Languages. +Proceedings of the International Conference on Language Resources and Evaluation, 2018. + +**Inputs** + +- Corpus: A collection of documents. + +**Outputs** + +- Corpus: Corpus with new features appended. 
+ +**Document Embedding** parses ngrams of each document in corpus, obtains embedding +for each ngram using pretrained model for chosen language and obtains one vector for each document by aggregating ngram embeddings using one of offered aggregators. Note that method will work on any ngrams but it will give best results if corpus is preprocessed such that ngrams are words (because model was trained to embed words). + +![](images/Document-Embedding-stamped.png) + +1. Widget parameters: + - Language: + - English: will use model trained on documents in English language + - Slovenian: will use model trained on documents in Slovenian language + - German: will use model trained on documents in German language + - Aggregator: + - Mean: aggregate word embedding into document embedding by averaging them + - Sum: aggregate word embedding into document embedding by summing them + - Max: aggregate word embedding into document embedding by taking element-wise maximum + - Min: aggregate word embedding into document embedding by taking element-wise minimum +2. Cancel current execution. +3. If *Apply automatically* is checked, changes in parameters are sent automatically. Alternatively press *Apply*. + +Examples +-------- + +In the first example, we will inspect how the widget works. Load *book-excerpts.tab* using [Corpus](corpus-widget.md) widget and connect it to **Document Embedding**. Check the output data by connecting **Document Embedding** to **Data Table**. We see additional 300 features that the widget has appended. + +![](images/Document-Embedding-Example1.png) + +In the second example we will try to predict document category. We will keep working on *book-excerpts.tab* loaded with [Corpus](corpus-widget.md) widget and sent through [Preprocess Text](preprocesstext.md) with default parameters. Connect **Preprocess Text** to **Document Embedding** to obtain features for predictive modelling. Here we set aggregator to Sum. 
+ +Connect **Document Embedding** to **Test and Score** and also connect learner of choice to the left side of **Test and Score**. We chose SVM and changed kernel to Linear. **Test and Score** will now compute performance of each learner on the input. We can see that we achieved great results. + +Let's now inspect confusion matrix. Connect **Test and Score** to **Confusion Matrix**. +Clicking on *Select Misclassified* will output documents that were misclassified. We can further inspect them by connecting [Corpus Viewer](corpusviewer.md) to **Confusion Matrix**. + +![](images/Document-Embedding-Example2.png) \ No newline at end of file diff --git a/doc/widgets/images/Document-Embedding-Example1.png b/doc/widgets/images/Document-Embedding-Example1.png new file mode 100644 index 000000000..7811645e9 Binary files /dev/null and b/doc/widgets/images/Document-Embedding-Example1.png differ diff --git a/doc/widgets/images/Document-Embedding-Example2.png b/doc/widgets/images/Document-Embedding-Example2.png new file mode 100644 index 000000000..a38c624cf Binary files /dev/null and b/doc/widgets/images/Document-Embedding-Example2.png differ diff --git a/doc/widgets/images/Document-Embedding-stamped.png b/doc/widgets/images/Document-Embedding-stamped.png new file mode 100644 index 000000000..588cd1bf4 Binary files /dev/null and b/doc/widgets/images/Document-Embedding-stamped.png differ diff --git a/orangecontrib/text/tests/test_documentembedder.py b/orangecontrib/text/tests/test_documentembedder.py new file mode 100644 index 000000000..eb5274b4d --- /dev/null +++ b/orangecontrib/text/tests/test_documentembedder.py @@ -0,0 +1,147 @@ +import unittest +from unittest.mock import patch +import asyncio +from numpy.testing import assert_array_equal + +from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder +from orangecontrib.text import Corpus + +PATCH_METHOD = 'httpx.AsyncClient.post' + + +class DummyResponse: + + def __init__(self, content): + self.content 
= content + +def make_dummy_post(response, sleep=0): + @staticmethod + async def dummy_post(url, headers, data): + await asyncio.sleep(sleep) + return DummyResponse(content=response) + return dummy_post + + +class DocumentEmbedderTest(unittest.TestCase): + + def setUp(self): + self.embedder = DocumentEmbedder() # default params + self.corpus = Corpus.from_file('deerwester') + + def tearDown(self): + self.embedder.clear_cache() + + @patch(PATCH_METHOD) + def test_with_empty_corpus(self, mock): + self.assertEqual(len(self.embedder(self.corpus[:0])), 0) + mock.request.assert_not_called() + mock.get_response.assert_not_called() + self.assertEqual(self.embedder._embedder._cache._cache_dict, dict()) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_success_subset(self): + res = self.embedder(self.corpus[[0]]) + assert_array_equal(res.X, [[0.3, 1]]) + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_success_shapes(self): + res = self.embedder(self.corpus) + self.assertEqual(res.X.shape, (len(self.corpus), 2)) + self.assertEqual(len(res.domain), len(self.corpus.domain) + 2) + + @patch(PATCH_METHOD, make_dummy_post(b'')) + def test_empty_response(self): + with self.assertWarns(RuntimeWarning): + res = self.embedder(self.corpus[[0]]) + self.assertEqual(res.X.shape, (0, 0)) + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + + @patch(PATCH_METHOD, make_dummy_post(b'str')) + def test_invalid_response(self): + with self.assertWarns(RuntimeWarning): + res = self.embedder(self.corpus[[0]]) + self.assertEqual(res.X.shape, (0, 0)) + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}')) + def test_invalid_json_key(self): + with self.assertWarns(RuntimeWarning): + res = self.embedder(self.corpus[[0]]) + self.assertEqual(res.X.shape, (0, 0)) + 
self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_persistent_caching(self): + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + self.embedder(self.corpus[[0]]) + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1) + self.embedder._embedder._cache.persist_cache() + + self.embedder = DocumentEmbedder() + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1) + + self.embedder.clear_cache() + self.embedder = DocumentEmbedder() + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_cache_for_different_languages(self): + embedder = DocumentEmbedder(language='sl') + embedder.clear_cache() + self.assertEqual(len(embedder._embedder._cache._cache_dict), 0) + embedder(self.corpus[[0]]) + self.assertEqual(len(embedder._embedder._cache._cache_dict), 1) + embedder._embedder._cache.persist_cache() + + self.embedder = DocumentEmbedder() + self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0) + self.embedder._embedder._cache.persist_cache() + + embedder = DocumentEmbedder(language='sl') + self.assertEqual(len(embedder._embedder._cache._cache_dict), 1) + embedder.clear_cache() + self.embedder.clear_cache() + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_cache_for_different_aggregators(self): + embedder = DocumentEmbedder(aggregator='max') + embedder.clear_cache() + self.assertEqual(len(embedder._embedder._cache._cache_dict), 0) + embedder(self.corpus[[0]]) + self.assertEqual(len(embedder._embedder._cache._cache_dict), 1) + embedder._embedder._cache.persist_cache() + + embedder = DocumentEmbedder(aggregator='min') + self.assertEqual(len(embedder._embedder._cache._cache_dict), 1) + embedder(self.corpus[[0]]) + self.assertEqual(len(embedder._embedder._cache._cache_dict), 2) + + 
@patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_with_statement(self): + with self.embedder as embedder: + res = embedder(self.corpus[[0]]) + assert_array_equal(res.X, [[0.3, 1]]) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}')) + def test_cancel(self): + self.assertFalse(self.embedder._embedder._cancelled) + self.embedder._embedder._cancelled = True + with self.assertRaises(Exception): + self.embedder(self.corpus[[0]]) + + @patch(PATCH_METHOD, side_effect=OSError) + def test_connection_error(self, _): + embedder = DocumentEmbedder() + with self.assertRaises(ConnectionError): + embedder(self.corpus[[0]]) + + def test_invalid_parameters(self): + with self.assertRaises(ValueError): + self.embedder = DocumentEmbedder(language='eng') + with self.assertRaises(ValueError): + self.embedder = DocumentEmbedder(aggregator='average') + + def test_invalid_corpus_type(self): + with self.assertRaises(ValueError): + self.embedder(self.corpus[0]) diff --git a/orangecontrib/text/vectorization/document_embedder.py b/orangecontrib/text/vectorization/document_embedder.py new file mode 100644 index 000000000..821ef7a54 --- /dev/null +++ b/orangecontrib/text/vectorization/document_embedder.py @@ -0,0 +1,193 @@ +"""This module contains classes used for embedding documents +into a vector space. +""" +import zlib +import base64 +import json +import sys +import warnings +from typing import Tuple, Any, Optional +import numpy as np + +from Orange.misc.server_embedder import ServerEmbedderCommunicator +from orangecontrib.text import Corpus + + +AGGREGATORS = ['Mean', 'Sum', 'Max', 'Min'] +AGGREGATORS_L = ['mean', 'sum', 'max', 'min'] +LANGS_TO_ISO = {'English': 'en', 'Slovenian': 'sl', 'German': 'de'} +LANGUAGES = list(LANGS_TO_ISO.values()) + + +class DocumentEmbedder: + """This class is used for obtaining dense embeddings of documents in + corpus using fastText pretrained models from: + E. Grave, P. Bojanowski, P. Gupta, A. Joulin, T. 
Mikolov, + Learning Word Vectors for 157 Languages. + Proceedings of the International Conference on Language Resources and Evaluation, 2018. + + Embedding is performed on server so the internet connection is a + prerequisite for using the class. Currently supported languages are: + - English (en) + - Slovenian (sl) + - German (de) + + Attributes + ---------- + language : str + ISO 639-1 (two-letter) code of desired language. + aggregator : str + Aggregator which creates document embedding (single + vector) from word embeddings (multiple vectors). + Allowed values are Mean, Sum, Max, Min. + """ + + def __init__(self, language: str = 'en', + aggregator: str = 'Mean') -> None: + lang_error = '{} is not a valid language. Allowed values: {}' + agg_error = '{} is not a valid aggregator. Allowed values: {}' + if language.lower() not in LANGUAGES: + raise ValueError(lang_error.format(language, ', '.join(LANGUAGES))) + self.language = language.lower() + if aggregator.lower() not in AGGREGATORS_L: + raise ValueError(agg_error.format(aggregator, ', '.join(AGGREGATORS))) + self.aggregator = aggregator.lower() + + self._embedder = _ServerEmbedder(self.aggregator, + model_name='fasttext-'+self.language, + max_parallel_requests=100, + server_url='https://example.com', + # TODO set proper url + embedder_type='text') + + def __call__(self, corpus: Corpus, copy: bool = True, + processed_callback=None) -> Corpus: + """Adds matrix of document embeddings to a corpus. + + Parameters + ---------- + corpus : Corpus + Corpus on which transform is performed. + copy : bool + If set to True, a copy of corpus is made. + + Returns + ------- + Corpus + Corpus (original or a copy) with new features added. + + Raises + ------ + ValueError + If corpus is not instance of Corpus. + RuntimeError + If document in corpus is larger than + 50 KB after compression. 
+ """ + if not isinstance(corpus, Corpus): + raise ValueError("Input should be instance of Corpus.") + corpus = corpus.copy() if copy else corpus + embs = self._embedder.embedd_data( + list(corpus.ngrams), + processed_callback=processed_callback) + + dim = None + send_warning = False + for emb in embs: # find embedding dimension + if emb is not None: + dim = len(emb) + break + # Check if some documents in corpus in weren't embedded + # for some reason. This is a very rare case. + inds = list() + for i, emb in enumerate(embs): + if emb is not None: + inds.append(i) + else: + embs[i] = np.zeros(dim) * np.nan + send_warning = True + + variable_attrs = { + 'hidden': True, + 'skip-normalization': True, + 'document-embedding-feature': True + } + embs = np.array(embs) + new_corpus = corpus[inds] + + if len(inds) > 0: + # if at least one embedding is not None, + # extend attributes + new_corpus.extend_attributes( + np.array(embs[inds]), + ['Dim{}'.format(i) for i in range(dim)], + var_attrs=variable_attrs) + + if send_warning: + warnings.warn(("Some documents were not embedded for " + + "unknown reason. Those documents " + + "are skipped."), + RuntimeWarning) + + return new_corpus + + def report(self) -> Tuple[Tuple[str, str], Tuple[str, str]]: + """Reports on current parameters of DocumentEmbedder. + + Returns + ------- + tuple + Tuple of parameters. 
+ """ + return (('Language', self.language), + ('Aggregator', self.aggregator)) + + def set_cancelled(self): + """Cancels current embedding process""" + if hasattr(self, '_embedder'): + self._embedder.set_cancelled() + + def clear_cache(self): + """Clears embedder cache""" + if self._embedder: + self._embedder.clear_cache() + + def __enter__(self): + return self + + def __exit__(self, ex_type, value, traceback): + self.set_cancelled() + + def __del__(self): + self.__exit__(None, None, None) + + +class _ServerEmbedder(ServerEmbedderCommunicator): + def __init__(self, aggregator: str, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.content_type = 'application/json' + self.aggregator = aggregator + + async def _encode_data_instance(self, data_instance: Any) -> Optional[bytes]: + data_string = json.dumps(list(data_instance)) + data = base64.b64encode(zlib.compress( + data_string.encode('utf-8', 'replace'), + level=-1)).decode('utf-8', 'replace') + + if sys.getsizeof(data) > 50000: + raise RuntimeError("Document in corpus is too large. 
\ + Size limit is 50 KB (after compression).") + + data_dict = { + "data": data, + "aggregator": self.aggregator + } + + json_string = json.dumps(data_dict) + return json_string.encode('utf-8', 'replace') + + +if __name__ == '__main__': + with DocumentEmbedder(language='en', aggregator='Max') as embedder: + embedder.clear_cache() + embedder(Corpus.from_file('deerwester')) diff --git a/orangecontrib/text/widgets/owdocumentembedding.py b/orangecontrib/text/widgets/owdocumentembedding.py new file mode 100644 index 000000000..615d381a9 --- /dev/null +++ b/orangecontrib/text/widgets/owdocumentembedding.py @@ -0,0 +1,232 @@ +from typing import Any +import numpy as np + +from AnyQt.QtWidgets import QPushButton, QStyle, QLayout +from AnyQt.QtCore import Qt, QSize + +from Orange.widgets.gui import widgetBox, comboBox, auto_commit, hBox +from Orange.widgets.settings import Setting +from Orange.widgets.widget import OWWidget, Msg, Input, Output +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState + +from Orange.misc.utils.embedder_utils import EmbeddingConnectionError + +from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder +from orangecontrib.text.vectorization.document_embedder import LANGS_TO_ISO, AGGREGATORS +from orangecontrib.text.corpus import Corpus + + +def run_pretrained_embedder(corpus: Corpus, + language: str, + aggregator: str, + state: TaskState) -> Corpus: + + """Runs DocumentEmbedder. + + Parameters + ---------- + corpus : Corpus + Corpus on which transform is performed. + language : str + ISO 639-1 (two-letter) code of desired language. + aggregator : str + Aggregator which creates document embedding (single + vector) from word embeddings (multiple vectors). + Allowed values are mean, sum, max, min. + state : TaskState + State object. + + Returns + ------- + Corpus + New corpus with additional features. 
+ """ + + embedder = DocumentEmbedder(language=language, + aggregator=aggregator) + + ticks = iter(np.linspace(0., 100., len(corpus))) + + def advance(success=True): + if state.is_interruption_requested(): + embedder.set_cancelled() + if success: + state.set_progress_value(next(ticks)) + + new_corpus = embedder(corpus, processed_callback=advance) + return new_corpus + + +class OWDocumentEmbedding(OWWidget, ConcurrentWidgetMixin): + name = "Document Embedding" + description = "Document embedding using pretrained models." + keywords = ['embedding', 'document embedding', 'text'] + icon = 'icons/TextEmbedding.svg' + priority = 300 + + want_main_area = False + _auto_apply = Setting(default=True) + + class Inputs: + corpus = Input('Corpus', Corpus) + + class Outputs: + new_corpus = Output('Corpus', Corpus) + + class Error(OWWidget.Error): + no_connection = Msg("""No internet connection. + Please establish a connection or + use another vectorizer.""") + unexpected_error = Msg('Embedding error: {}') + + class Warning(OWWidget.Warning): + unsuccessful_embeddings = Msg('Some embeddings were unsuccessful.') + + language = Setting(default=0) + aggregator = Setting(default=0) + + def __init__(self): + OWWidget.__init__(self) + ConcurrentWidgetMixin.__init__(self) + + self.languages = list(LANGS_TO_ISO.keys()) + self.aggregators = AGGREGATORS + self.corpus = None + self.new_corpus = None + self._setup_layout() + + @staticmethod + def sizeHint(): + return QSize(300, 300) + + def _setup_layout(self): + self.controlArea.setMinimumWidth(self.sizeHint().width()) + self.layout().setSizeConstraint(QLayout.SetFixedSize) + + widget_box = widgetBox(self.controlArea, 'Settings') + + self.language_cb = comboBox(widget=widget_box, + master=self, + value='language', + label='Language: ', + orientation=Qt.Horizontal, + items=self.languages, + callback=self._option_changed) + + self.aggregator_cb = comboBox(widget=widget_box, + master=self, + value='aggregator', + label='Aggregator: ', + 
orientation=Qt.Horizontal, + items=self.aggregators, + callback=self._option_changed) + + self.auto_commit_widget = auto_commit(widget=self.controlArea, + master=self, + value='_auto_apply', + label='Apply', + commit=self.commit) + + self.cancel_button = QPushButton( + 'Cancel', + icon=self.style() + .standardIcon(QStyle.SP_DialogCancelButton)) + + self.cancel_button.clicked.connect(self.cancel) + + hbox = hBox(self.controlArea) + hbox.layout().addWidget(self.cancel_button) + self.cancel_button.setDisabled(True) + + def set_input_corpus_summary(self, corpus): + if corpus is None: + self.info.set_input_summary(self.info.NoInput) + else: + self.info.set_input_summary(str(len(corpus)), "{} documents." + .format(len(corpus))) + + def set_output_corpus_summary(self, corpus): + if corpus is None: + self.info.set_output_summary(self.info.NoOutput) + else: + unsuccessful = len(self.corpus) - len(corpus) + if unsuccessful > 0: + self.Warning.unsuccessful_embeddings() + self.info.set_output_summary( + str(int(len(corpus))), + "Successful: {}, Unsuccessful: {}".format( + int(len(corpus)), int(unsuccessful))) + + @Inputs.corpus + def set_data(self, data): + self.Warning.clear() + self.set_input_corpus_summary(data) + + if not data: + self.corpus = None + self.clear_outputs() + return + + self.corpus = data + self.commit() + + def _option_changed(self): + self.commit() + + def commit(self): + if self.corpus is None: + self.clear_outputs() + return + + self._set_fields(False) + + self.start(run_pretrained_embedder, + self.corpus, + LANGS_TO_ISO[self.languages[self.language]], + self.aggregators[self.aggregator]) + + self.Error.clear() + + def on_done(self, result: Any) -> None: + self._set_fields(True) + self._send_output_signals(result) + + def on_partial_result(self, result: Any): + self.cancel() + self.Error.no_connection() + + def on_exception(self, ex: Exception): + self._set_fields(False) + if isinstance(ex, EmbeddingConnectionError): + self.Error.no_connection() + else: + 
self.Error.unexpected_error(type(ex).__name__) + self.cancel() + self.clear_outputs() + + + def cancel(self): + self._set_fields(True) + super().cancel() + + def _set_fields(self, active): + self.auto_commit_widget.setDisabled(not active) + self.cancel_button.setDisabled(active) + self.language_cb.setDisabled(not active) + self.aggregator_cb.setDisabled(not active) + + def _send_output_signals(self, result): + self.Outputs.new_corpus.send(result) + self.set_output_corpus_summary(result) + + def clear_outputs(self): + self._send_output_signals(None) + + def onDeleteWidget(self): + self.cancel() + super().onDeleteWidget() + + +if __name__ == '__main__': + from orangewidget.utils.widgetpreview import WidgetPreview + WidgetPreview(OWDocumentEmbedding).run(Corpus.from_file('book-excerpts')) diff --git a/orangecontrib/text/widgets/tests/test_owdocumentembedding.py b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py new file mode 100644 index 000000000..e33c12263 --- /dev/null +++ b/orangecontrib/text/widgets/tests/test_owdocumentembedding.py @@ -0,0 +1,90 @@ +from unittest.mock import Mock, patch + +from Orange.widgets.tests.base import WidgetTest +from Orange.widgets.tests.utils import simulate +from Orange.misc.utils.embedder_utils import EmbeddingConnectionError + +from orangecontrib.text.tests.test_documentembedder import PATCH_METHOD, make_dummy_post +from orangecontrib.text.widgets.owdocumentembedding import OWDocumentEmbedding +from orangecontrib.text import Corpus + + +class TestOWDocumentEmbedding(WidgetTest): + + def setUp(self): + self.widget = self.create_widget(OWDocumentEmbedding) + self.corpus = Corpus.from_file('deerwester') + self.larger_corpus = Corpus.from_file('book-excerpts') + + def test_input(self): + set_data = self.widget.set_data = Mock() + self.send_signal("Corpus", None) + set_data.assert_called_with(None) + self.send_signal("Corpus", self.corpus[:0]) + set_data.assert_called_with(self.corpus[:0]) + self.send_signal("Corpus", 
self.corpus) + set_data.assert_called_with(self.corpus) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}')) + def test_output(self): + self.send_signal("Corpus", None) + self.assertIsNone(self.get_output(self.widget.Outputs.new_corpus)) + + self.send_signal("Corpus", self.corpus) + self.wait_until_finished() + result = self.get_output(self.widget.Outputs.new_corpus) + self.assertIsNotNone(result) + self.assertIsInstance(result, Corpus) + self.assertEqual(len(self.corpus), len(result)) + + def test_input_summary(self): + input_summary = self.widget.info.set_input_summary = Mock() + self.send_signal("Corpus", None) + input_summary.assert_called_with(self.widget.info.NoInput) + + self.send_signal("Corpus", self.corpus) + input_summary.assert_called_with(str(len(self.corpus)), "9 documents.") + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}')) + def test_output_summary(self): + output_summary = self.widget.info.set_output_summary = Mock() + self.send_signal("Corpus", self.corpus) + self.wait_until_finished() + output_summary.assert_called_with( + str(int(len(self.corpus))), + "Successful: {}, Unsuccessful: {}".format( + int(len(self.corpus)), int(0))) + + @patch(PATCH_METHOD, make_dummy_post(b'')) + def test_some_failed(self): + simulate.combobox_activate_index(self.widget.controls.aggregator, 1) + self.send_signal("Corpus", self.corpus) + self.wait_until_finished() + result = self.get_output(self.widget.Outputs.new_corpus) + self.assertGreater(len(self.corpus), len(result)) + self.assertTrue(self.widget.Warning.unsuccessful_embeddings.is_shown()) + + @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [1.3, 1]}')) + def test_cancel_embedding(self): + self.send_signal("Corpus", self.larger_corpus) + self.widget.cancel_button.click() + self.wait_until_finished() + self.assertIsNone(self.get_output(self.widget.Outputs.new_corpus)) + + @patch('orangecontrib.text.vectorization.document_embedder' + + '.DocumentEmbedder.__call__', + 
side_effect=EmbeddingConnectionError) + def test_connection_error(self, _): + self.send_signal("Corpus", self.corpus) + self.wait_until_finished() + self.assertIsNone(self.get_output(self.widget.Outputs.new_corpus)) + self.assertTrue(self.widget.Error.no_connection.is_shown()) + + @patch('orangecontrib.text.vectorization.document_embedder' + + '.DocumentEmbedder.__call__', + side_effect=OSError) + def test_unexpected_error(self, _): + self.send_signal("Corpus", self.corpus) + self.wait_until_finished() + self.assertIsNone(self.get_output(self.widget.Outputs.new_corpus)) + self.assertTrue(self.widget.Error.unexpected_error.is_shown()) diff --git a/requirements.txt b/requirements.txt index a9aa22e95..01f12ddca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ docutils<0.16 # denpendency for botocore python-dateutil<2.8.1 # denpendency for botocore gensim>=0.12.3 # LDA's show topics unified in 0.12.3 setuptools-git -Orange3 >=3.21.0 +Orange3 >=3.25.0 tweepy beautifulsoup4 simhash