From 2f6d3f92270d6c5c6a8015c81191e4587f6aea48 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 16 Sep 2022 16:04:41 +0200 Subject: [PATCH] Speedup corpus viewer --- orangecontrib/text/widgets/owcorpusviewer.py | 819 +++++++++++------- .../text/widgets/tests/test_owcorpusviewer.py | 280 +++++- 2 files changed, 760 insertions(+), 339 deletions(-) diff --git a/orangecontrib/text/widgets/owcorpusviewer.py b/orangecontrib/text/widgets/owcorpusviewer.py index 44c56fd24..8ecae15f4 100644 --- a/orangecontrib/text/widgets/owcorpusviewer.py +++ b/orangecontrib/text/widgets/owcorpusviewer.py @@ -1,27 +1,291 @@ import os import re import sre_constants -from itertools import chain -from typing import Set +from typing import Any, Iterable, List, Set +import numpy as np from AnyQt.QtCore import ( - Qt, QUrl, QItemSelection, QItemSelectionModel, QItemSelectionRange + QAbstractListModel, + QEvent, + QItemSelection, + QItemSelectionModel, + QItemSelectionRange, + QModelIndex, + QSortFilterProxyModel, + Qt, + QUrl, ) - -from AnyQt.QtGui import QStandardItemModel, QStandardItem -from AnyQt.QtWidgets import (QListView, QSizePolicy, QTableView, - QAbstractItemView, QHeaderView, QSplitter, - QApplication) - -from Orange.data.domain import filter_visible +from AnyQt.QtWidgets import ( + QAbstractItemView, + QApplication, + QHeaderView, + QListView, + QSizePolicy, + QSplitter, + QTableView, +) +from Orange.data import Variable +from Orange.data.domain import Domain, filter_visible from Orange.widgets import gui -from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler +from Orange.widgets.settings import ContextSetting, Setting, DomainContextHandler from Orange.widgets.utils.annotated_data import create_annotated_table -from Orange.widgets.widget import OWWidget, Msg, Input, Output +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState +from Orange.widgets.utils.itemmodels import DomainModel +from Orange.widgets.widget import Input, Msg, Output, OWWidget +from orangecanvas.gui.utils import disconnected +from orangewidget.utils.listview import ListViewSearch + from orangecontrib.text.corpus import Corpus +HTML = """ + + + + + + + + + + +{} + + +""" +SEPARATOR = ( + '' +) + -class OWCorpusViewer(OWWidget): +def _count_matches(content: List[str], search_string: str, state: TaskState) -> int: + """ + Count number of appears of any terms in search_string in content texts. + + Parameters + ---------- + content + List of texts where we count appearances + search_string + Strings that are searched in texts. This parameter has a format + term1|term2|term3|... + + Returns + ------- + Number of all matches of search_string in all texts in content list + """ + matches = 0 + if search_string: + regex = re.compile(search_string.strip("|"), re.IGNORECASE) + for i, text in enumerate(content): + matches += len(regex.findall(text)) + state.set_progress_value((i + 1) / len(content) * 100) + return matches + + +class DocumentListModel(QAbstractListModel): + """ + Custom model for listing documents. Using custom model since Onrage's + pylistmodel is too slow for large number of documents + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__visible_data = [] + self.__filter_content = [] + + def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any: + if role == Qt.DisplayRole: + return self.__visible_data[index.row()] + elif role == Qt.UserRole: + return self.__filter_content[index.row()] + + def rowCount(self, parent: QModelIndex = None, *args, **kwargs) -> int: + return len(self.__visible_data) + + def setup_data(self, data: List[str], content: List[str]): + self.beginResetModel() + self.__visible_data = data + self.__filter_content = content + self.endResetModel() + + def update_filter_content(self, content: List[str]): + assert len(content) == len(self.__visible_data) + self.__filter_content = content + + def get_filter_content(self) -> List[str]: + return self.__filter_content + + def clear(self): + self.beginResetModel() + self.__visible_data = [] + self.__filter_content = [] + self.endResetModel() + + +class DocumentsFilterProxyModel(QSortFilterProxyModel): + """Filter model for documents list""" + + __regex = None + + def set_filter_string(self, filter_string: str): + self.__regex = re.compile(filter_string.strip("|"), re.IGNORECASE) + self.invalidateFilter() + + def filterAcceptsRow(self, source_row: int, source_parent: QModelIndex) -> bool: + """Filter document that mathc the filter string""" + if self.__regex is None: + # filter is not defined yet - show all + return True + else: + index = self.sourceModel().index(source_row, 0, source_parent) + content = self.sourceModel().data(index, Qt.UserRole) + res = self.__regex.search(content) + return bool(res) + + +class DocumentTableView(QTableView): + """TableView that disables unselecting all items""" + + def selectionCommand( + self, index: QModelIndex, event: QEvent = None + ) -> QItemSelectionModel.SelectionFlags: + flags = super().selectionCommand(index, event) + selmodel = self.selectionModel() + if not index.isValid(): # Click on empty viewport; don't clear + return QItemSelectionModel.NoUpdate + if selmodel.isSelected(index): + currsel = selmodel.selectedIndexes() + if len(currsel) == 1 and index == currsel[0]: + # Is the last selected index; do not deselect it + return QItemSelectionModel.NoUpdate + if ( + event is not None + and event.type() == QEvent.MouseMove + and flags & QItemSelectionModel.ToggleCurrent + ): + # Disable ctrl drag 'toggle'; can be made to deselect the last + # index, would need to keep track of the current selection + # (selectionModel does this but does not expose it) + flags &= ~QItemSelectionModel.Toggle + flags |= QItemSelectionModel.Select + return flags + + +class VariableListViewSearch(ListViewSearch): + """ListViewSearch that disables unselecting all items in the list""" + + def selectionCommand( + self, index: QModelIndex, event: QEvent = None + ) -> QItemSelectionModel.SelectionFlags: + flags = super().selectionCommand(index, event) + selmodel = self.selectionModel() + if not index.isValid(): # Click on empty viewport; don't clear + return QItemSelectionModel.NoUpdate + if selmodel.isSelected(index): + currsel = selmodel.selectedIndexes() + if len(currsel) == 1 and index == currsel[0]: + # Is the last selected index; do not deselect it + return QItemSelectionModel.NoUpdate + if ( + event is not None + and event.type() == QEvent.MouseMove + and flags & QItemSelectionModel.ToggleCurrent + ): + # Disable ctrl drag 'toggle'; can be made to deselect the last + # index, would need to keep track of the current selection + # (selectionModel does this but does not expose it) + flags &= ~QItemSelectionModel.Toggle + flags |= QItemSelectionModel.Select + return flags + + def set_selection(self, items: Iterable[Variable]): + """Set selected items in the list view""" + model = self.model() + values = self.model()[:] + items = [it for it in items if it in values] + selection = QItemSelection() + if items: + for val in items: + index = values.index(val) + selection.merge( + QItemSelection(model.index(index, 0), model.index(index, 0)), + QItemSelectionModel.Select, + ) + self.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect) + + +class VisibleDomainModel(DomainModel): + """Domain model that filter only visible features""" + + def set_domain(self, domain): + if domain is not None: + domain = Domain( + filter_visible(domain.attributes), + class_vars=filter_visible(domain.class_vars), + metas=filter_visible(domain.metas), + ) + super().set_domain(domain) + + +class OWCorpusViewer(OWWidget, ConcurrentWidgetMixin): name = "Corpus Viewer" description = "Display corpus contents." icon = "icons/CorpusViewer.svg" @@ -35,74 +299,78 @@ class Outputs: other_docs = Output("Other Docs", Corpus) corpus = Output("Corpus", Corpus) - settingsHandler = PerfectDomainContextHandler( - match_values = PerfectDomainContextHandler.MATCH_VALUES_ALL - ) + settingsHandler = DomainContextHandler() - search_indices = ContextSetting([], exclude_metas=False) # features included in search - display_indices = ContextSetting([], exclude_metas=False) # features for display - display_features = ContextSetting([], exclude_metas=False) - selected_documents = ContextSetting([]) + settings_version = 2 + search_features: List[Variable] = ContextSetting([]) + display_features: List[Variable] = ContextSetting([]) + selected_documents: Set[int] = Setting({0}, schema_only=True) regexp_filter = ContextSetting("") - show_tokens = Setting(False) autocommit = Setting(True) class Warning(OWWidget.Warning): - no_feats_search = Msg('No features included in search.') - no_feats_display = Msg('No features selected for display.') + no_feats_search = Msg("No features included in search.") + no_feats_display = Msg("No features selected for display.") def __init__(self): super().__init__() + ConcurrentWidgetMixin.__init__(self) - self.corpus = None # Corpus - self.corpus_docs = None # Documents generated from Corpus - self.doc_webview = None # WebView for showing content - self.search_features = [] # two copies are needed since Display allows drag & drop - self.display_list_indices = [0] - self.matches = 0 # Matches of the query + self.corpus = None # Corpus + self.__pending_selected_documents = self.selected_documents # Info attributes self.update_info() - info_box = gui.widgetBox(self.controlArea, 'Info') - gui.label(info_box, self, 'Tokens: %(n_tokens)s') - gui.label(info_box, self, 'Types: %(n_types)s') - gui.label(info_box, self, 'Matching documents: %(n_matching)s') - gui.label(info_box, self, 'Matches: %(n_matches)s') + info_box = gui.widgetBox(self.controlArea, "Info") + gui.label(info_box, self, "Tokens: %(n_tokens)s") + gui.label(info_box, self, "Types: %(n_types)s") + gui.label(info_box, self, "Matching documents: %(n_matching)s") + gui.label(info_box, self, "Matches: %(n_matches)s") # Search features - self.search_listbox = gui.listBox( - self.controlArea, self, 'search_indices', 'search_features', - selectionMode=QListView.ExtendedSelection, - box='Search features', callback=self.search_features_changed) + ex_sel = QListView.ExtendedSelection + search_box = gui.widgetBox(self.controlArea, "Search features") + self.search_listbox = sl = VariableListViewSearch(selectionMode=ex_sel) + search_box.layout().addWidget(sl) + sl.setModel(VisibleDomainModel(separators=False)) + sl.selectionModel().selectionChanged.connect(self.search_features_changed) # Display features - display_box = gui.widgetBox(self.controlArea, 'Display features') - self.display_listbox = gui.listBox( - display_box, self, 'display_list_indices', 'display_features', - selectionMode=QListView.ExtendedSelection, - callback=self.show_docs, enableDragDrop=True) - self.show_tokens_checkbox = gui.checkBox(display_box, self, 'show_tokens', - 'Show Tokens && Tags', callback=self.show_docs) + display_box = gui.widgetBox(self.controlArea, "Display features") + self.display_listbox = dl = VariableListViewSearch(selectionMode=ex_sel) + display_box.layout().addWidget(dl) + dl.setModel(VisibleDomainModel(separators=False)) + dl.selectionModel().selectionChanged.connect(self.display_features_changed) + + self.show_tokens_checkbox = gui.checkBox( + display_box, + self, + "show_tokens", + "Show Tokens && Tags", + callback=self.show_docs, + ) # Auto-commit box - gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data', 'Auto send is on') + gui.auto_commit( + self.controlArea, self, "autocommit", "Send data", "Auto send is on" + ) # Search - self.filter_input = gui.lineEdit(self.mainArea, self, 'regexp_filter', - orientation=Qt.Horizontal, - sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, - QSizePolicy.Fixed), - label='RegExp Filter:', - callback=self.refresh_search) - - # Main area - self.splitter = QSplitter( + self.filter_input = gui.lineEdit( + self.mainArea, + self, + "regexp_filter", orientation=Qt.Horizontal, - childrenCollapsible=False, + sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed), + label="RegExp Filter:", + callback=self.refresh_search, ) + + # Main area + self.splitter = QSplitter(orientation=Qt.Horizontal, childrenCollapsible=False) # Document list - self.doc_list = QTableView() + self.doc_list = DocumentTableView() self.doc_list.setSelectionBehavior(QTableView.SelectRows) self.doc_list.setSelectionMode(QTableView.ExtendedSelection) self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers) @@ -110,11 +378,11 @@ def __init__(self): self.doc_list.horizontalHeader().setVisible(False) self.splitter.addWidget(self.doc_list) - self.doc_list_model = QStandardItemModel(self) - self.doc_list.setModel(self.doc_list_model) - self.doc_list.selectionModel().selectionChanged.connect( - self.selection_changed - ) + self.doc_list_model = DocumentListModel() + proxy_model = DocumentsFilterProxyModel() + proxy_model.setSourceModel(self.doc_list_model) + self.doc_list.setModel(proxy_model) + self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed) # Document contents self.doc_webview = gui.WebviewWidget(self.splitter, debug=False) @@ -129,23 +397,11 @@ def set_data(self, corpus=None): self.closeContext() self.reset_widget() self.corpus = corpus - self.search_features = [] if corpus is not None: - domain = self.corpus.domain - # Enable/disable tokens checkbox - if not self.corpus.has_tokens(): - self.show_tokens_checkbox.setCheckState(Qt.Unchecked) - self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens()) - - self.search_features = list(filter_visible(chain(domain.variables, domain.metas))) - self.display_features = list(filter_visible(chain(domain.variables, domain.metas))) - self.search_indices = list(range(len(self.search_features))) - self.display_indices = list(range(len(self.display_features))) - self.selected_documents = [corpus.titles[0]] if \ - corpus.titles is not None and len(corpus.titles) else [] + self.setup_controls() self.openContext(self.corpus) - self.display_list_indices = self.display_indices - self.regenerate_docs() + self.doc_list.model().set_filter_string(self.regexp_filter) + self.select_variables() self.list_docs() self.update_info() self.set_selection() @@ -155,219 +411,138 @@ def set_data(self, corpus=None): def reset_widget(self): # Corpus self.corpus = None - self.corpus_docs = None - self.display_features = [] # Widgets - self.search_listbox.clear() - self.display_listbox.clear() + self.search_listbox.model().set_domain(None) + self.display_listbox.model().set_domain(None) self.filter_input.clear() self.update_info() # Models/vars - self.search_features.clear() - self.search_indices.clear() - self.display_indices.clear() self.doc_list_model.clear() # Warnings self.Warning.clear() # WebView - self.doc_webview.setHtml('') + self.doc_webview.setHtml("") + + def setup_controls(self): + """Setup controls in control area""" + domain = self.corpus.domain + if not self.corpus.has_tokens(): + self.show_tokens_checkbox.setCheckState(Qt.Unchecked) + self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens()) + self.search_listbox.model().set_domain(domain) + self.display_listbox.model().set_domain(domain) + self.search_features = self.search_listbox.model()[:] + self.display_features = self.display_listbox.model()[:] + + def select_variables(self): + """Set selection to display and search features view boxes""" + smodel = self.search_listbox.model() + dmodel = self.display_listbox.model() + # it can happen that domain handler will set some features that are + # not part of domain - remove them + self.search_features = [f for f in self.search_features if f in smodel] + self.display_features = [f for f in self.display_features if f in dmodel] + # if no features after removing non-existent, select all - default + if not self.search_features: + self.search_features = smodel[:] + if not self.display_features: + self.display_features = dmodel[:] + with disconnected( + self.search_listbox.selectionModel().selectionChanged, + self.search_features_changed, + ): + self.search_listbox.set_selection(self.search_features) + with disconnected( + self.display_listbox.selectionModel().selectionChanged, + self.display_features_changed, + ): + self.display_listbox.set_selection(self.display_features) def list_docs(self): - """ List documents into the left scrolling area """ - if self.corpus_docs is None: - return - # TODO: remove search_keyword?? - search_keyword = self.regexp_filter.strip('|') - matches = 0 - try: - reg = re.compile(search_keyword, re.IGNORECASE) - except sre_constants.error: - return - - self.doc_list_model.clear() + """List documents into the left scrolling area""" + docs = self.regenerate_docs() + self.doc_list_model.setup_data(self.corpus.titles.tolist(), docs) - for i, (doc, title, content) in enumerate(zip(self.corpus, self.corpus.titles, - self.corpus_docs)): - res = len(list(reg.finditer(content))) if self.regexp_filter else 0 - if not self.regexp_filter or res: - matches += res - item = QStandardItem() - item.setData(str(title), Qt.DisplayRole) - item.setData(doc, Qt.UserRole) - self.doc_list_model.appendRow(item) - self.matches = matches - - def get_selected_documents_from_view(self) -> Set[str]: - """ - Returns - ------- - Set with names of selected documents in the QTableView - """ - return { - i.data(Qt.DisplayRole) - for i in self.doc_list.selectionModel().selectedRows() - } + def get_selected_indexes(self) -> Set[int]: + m = self.doc_list.model().mapToSource + return {m(i).row() for i in self.doc_list.selectionModel().selectedRows()} def set_selection(self) -> None: """ Select documents in selected_documents attribute in the view """ + self.selected_documents = self.__pending_selected_documents + self.__pending_selected_documents = {0} view = self.doc_list model = view.model() + source_model = model.sourceModel() - previously_selected = self.selected_documents.copy() selection = QItemSelection() - for row in range(model.rowCount()): - document = model.data(model.index(row, 0), Qt.DisplayRole) - if document in self.selected_documents: - selection.append(QItemSelectionRange( - view.model().index(row, 0), - view.model().index(row, 0) - )) - view.selectionModel().select( - selection, QItemSelectionModel.ClearAndSelect - ) - if len(selection) == 0: - # in cases when selection is empty qt's selection_changed is not - # called and so we need to manually trigger show_docs - self.show_docs() - # select emmit selection change signal which causes calling - # selection_changed when filtering it means that documents which - # are currently filtered out get removed from self.selected_douments - # we still want to keep them to be still selected after user removes - # filter - self.selected_documents = previously_selected + self.selected_documents = { + r for r in self.selected_documents if r < len(self.corpus) + } + for row in self.selected_documents: + index = model.mapFromSource(source_model.index(row, 0)) + selection.append(QItemSelectionRange(index, index)) + # don't emit selection change to avoid double call of commit function + # it is already called from set_data + with disconnected( + self.doc_list.selectionModel().selectionChanged, self.selection_changed + ): + view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect) def selection_changed(self) -> None: - """ - Function is called every time the selection changes - when user select - new range of documents - """ - self.selected_documents = self.get_selected_documents_from_view() + """Function is called every time the selection changes""" + self.selected_documents = self.get_selected_indexes() self.show_docs() self.commit.deferred() def show_docs(self): - """ Show the selected documents in the right area """ - HTML = ''' - - - - - - - - - - - {} - - - ''' - self.display_indices = self.display_list_indices + """Show the selected documents in the right area""" if self.corpus is None: return self.Warning.no_feats_display.clear() - if len(self.display_indices) == 0: + if len(self.display_features) == 0: self.Warning.no_feats_display() if self.show_tokens: tokens = list(self.corpus.ngrams_iterator(include_postags=True)) - marked_search_features = [f for i, f in enumerate(self.search_features) - if i in self.search_indices] - - html = '' - for doc_count, index in enumerate(self.doc_list.selectionModel().selectedRows()): - if doc_count > 0: # add split - html += '' \ - '' - - row_ind = index.data(Qt.UserRole).row_index - for ind in self.display_indices: - feature = self.display_features[ind] - value = str(index.data(Qt.UserRole)[feature.name]) - if feature in marked_search_features: + parts = [] + for doc_count, c_index in enumerate(sorted(self.selected_documents)): + text = "" + for feature in self.display_features: + value = str(self.corpus[c_index, feature.name]) + if feature in self.search_features: value = self.__mark_text(value) - value = value.replace('\n', '
') - is_image = feature.attributes.get('type', '') == 'image' - if is_image and value != '?': + value = value.replace("\n", "
") + is_image = feature.attributes.get("type", "") == "image" + if is_image and value != "?": value = os.path.join(feature.attributes.get("origin", ""), value) value = ''.format(value) - html += '' \ - ''.format( - feature.name, value) + text += ( + f'' + f'' + ) if self.show_tokens: - html += '' \ - ''.format(''.join('{}'.format( - token) for token in tokens[row_ind])) - - html += '
{}:{}
{feature.name}:{value}
Tokens & Tags:{}
' + tokens_ = "".join( + f'{token}' for token in tokens[c_index] + ) + text += ( + f'Tokens & Tags:' + f"{tokens_}" + ) + parts.append(text) + + joined = SEPARATOR.join(parts) + html = f"{joined}
" base = QUrl.fromLocalFile(__file__) self.doc_webview.setHtml(HTML.format(html), base) def __mark_text(self, text): - search_keyword = self.regexp_filter.strip('|') + search_keyword = self.regexp_filter.strip("|") if not search_keyword: return text @@ -382,71 +557,91 @@ def __mark_text(self, text): text = list(text) for m in matches[::-1]: - text[m.start():m.end()] = list('{}'\ - .format("".join(text[m.start():m.end()]))) - + text[m.start() : m.end()] = list( + f'{"".join(text[m.start(): m.end()])}' + ) return "".join(text) + @staticmethod + def __get_selected_rows(view: QListView) -> List[Variable]: + rows = view.selectionModel().selectedRows() + values = view.model()[:] + return [values[row.row()] for row in sorted(rows, key=lambda r: r.row())] + def search_features_changed(self): - self.regenerate_docs() + self.search_features = self.__get_selected_rows(self.search_listbox) + if self.corpus: + self.doc_list_model.update_filter_content(self.regenerate_docs()) + self.doc_list.model().invalidateFilter() self.refresh_search() - def regenerate_docs(self): - self.corpus_docs = None + def display_features_changed(self): + self.display_features = self.__get_selected_rows(self.display_listbox) + self.show_docs() + + def regenerate_docs(self) -> List[str]: self.Warning.no_feats_search.clear() - if self.corpus is not None: - feats = [self.search_features[i] for i in self.search_indices] - if len(feats) == 0: - self.Warning.no_feats_search() - self.corpus_docs = self.corpus.documents_from_features(feats) + if len(self.search_features) == 0: + self.Warning.no_feats_search() + return self.corpus.documents_from_features(self.search_features) def refresh_search(self): if self.corpus is not None: - self.list_docs() - self.set_selection() + self.doc_list.model().set_filter_string(self.regexp_filter) + if not self.selected_documents: + # when currently selected items are filtered selection is empty + # select first element in the view in that case + self.doc_list.setCurrentIndex(self.doc_list.model().index(0, 0)) self.update_info() + self.start( + _count_matches, + self.doc_list_model.get_filter_content(), + self.regexp_filter, + ) + self.show_docs() self.commit.deferred() + def on_done(self, res: int): + """When matches count is done show the result in the label""" + self.n_matches = res if res is not None else "n/a" + + def on_exception(self, ex): + raise ex + def update_info(self): if self.corpus is not None: - self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(), len(self.corpus)) - self.n_matches = self.matches if self.matches else 'n/a' - self.n_tokens = sum(map(len, self.corpus.tokens)) if self.corpus.has_tokens() else 'n/a' - self.n_types = len(self.corpus.dictionary) if self.corpus.has_tokens() else 'n/a' + has_tokens = self.corpus.has_tokens() + self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.corpus)}" + self.n_tokens = sum(map(len, self.corpus.tokens)) if has_tokens else "n/a" + self.n_types = len(self.corpus.dictionary) if has_tokens else "n/a" else: - self.n_matching = '' - self.n_matches = '' - self.n_tokens = '' - self.n_types = '' + self.n_matching = "n/a" + self.n_matches = "n/a" + self.n_tokens = "n/a" + self.n_types = "n/a" @gui.deferred def commit(self): matched = unmatched = annotated_corpus = None - corpus = self.corpus - if corpus is not None: - # it returns a set of selected documents which are in view - selected_docs = self.get_selected_documents_from_view() - titles = corpus.titles - matched_mask = [ - i for i, t in enumerate(titles) if t in selected_docs - ] - unmatched_mask = [ - i for i, t in enumerate(titles) if t not in selected_docs - ] - - matched = corpus[matched_mask] if len(matched_mask) else None - unmatched = corpus[unmatched_mask] if len(unmatched_mask) else None - annotated_corpus = create_annotated_table(corpus, matched_mask) + if self.corpus is not None: + selected_docs = sorted(self.get_selected_indexes()) + matched = self.corpus[selected_docs] if selected_docs else None + mask = np.ones(len(self.corpus), bool) + mask[selected_docs] = 0 + unmatched = self.corpus[mask] if mask.any() else None + annotated_corpus = create_annotated_table(self.corpus, selected_docs) self.Outputs.matching_docs.send(matched) self.Outputs.other_docs.send(unmatched) self.Outputs.corpus.send(annotated_corpus) def send_report(self): - self.report_items(( - ("Query", self.regexp_filter), - ("Matching documents", self.n_matching), - ("Matches", self.n_matches) - )) + self.report_items( + ( + ("Query", self.regexp_filter), + ("Matching documents", self.n_matching), + ("Matches", self.n_matches), + ) + ) def showEvent(self, event): super().showEvent(event) @@ -460,18 +655,44 @@ def update_splitter(self): """ w1, w2 = self.splitter.sizes() ws = w1 + w2 - if w2 < 2/3 * ws: - self.splitter.setSizes([int(ws * 1/3), int(ws * 2/3)]) - + if w2 < 2 / 3 * ws: + self.splitter.setSizes([int(ws * 1 / 3), int(ws * 2 / 3)]) + + @classmethod + def migrate_context(cls, context, version): + if version < 2: + f_order = context.values.pop("display_features", None) + display_idx = context.values.pop("display_indices", []) + search_ids = context.values.pop("search_indices", []) + if f_order is not None: + f_order = f_order[0] + display_features = [f_order[i] for i in display_idx if i < len(f_order)] + search_features = [f_order[i] for i in search_ids if i < len(f_order)] + context.values["display_features"] = (display_features, -3) + context.values["search_features"] = (search_features, -3) + + # old widget used PerfectDomainContextHandler with MATCH_VALUES_ALL + # now it uses DomainContextHandler. The difference are: + # - perfect handler stores values in tuple while domain in dicts + # - domain context handler store class_vars together with attributes + # while perfect handler store them separately + # - since MATCH_VALUES_ALL was used discrete var values were stored + # with var name (replacing them with id for discrete var - 1) + if hasattr(context, "class_vars"): + context.attributes = { + attr: 1 if isinstance(v, list) else v + for attr, v in context.attributes + context.class_vars + } + context.metas = dict(context.metas) + delattr(context, "class_vars") + + +if __name__ == "__main__": + from orangewidget.utils.widgetpreview import WidgetPreview -if __name__ == '__main__': from orangecontrib.text.preprocess import BASE_TOKENIZER - from orangecontrib.text.tag.pos import AveragedPerceptronTagger - from orangewidget.utils.widgetpreview import WidgetPreview - corpus = Corpus.from_file('book-excerpts') - corpus = corpus[:3] - tagger = AveragedPerceptronTagger() - tagged_corpus = tagger(BASE_TOKENIZER(corpus)) - tagged_corpus.ngram_range = (1, 2) - WidgetPreview(OWCorpusViewer).run(tagged_corpus) + corpus_ = Corpus.from_file("book-excerpts") + corpus_ = corpus_[:3] + corpus_ = BASE_TOKENIZER(corpus_) + WidgetPreview(OWCorpusViewer).run(corpus_) diff --git a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py index 6d1847868..f19057885 100644 --- a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py +++ b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py @@ -1,18 +1,110 @@ import unittest +from unittest import TestCase import numpy as np +from AnyQt.QtCore import QItemSelectionModel, Qt from AnyQt.QtTest import QSignalSpy +from orangewidget.settings import Context + +from Orange.data import StringVariable, Domain from Orange.widgets.tests.base import WidgetTest -from Orange.data import StringVariable from orangecontrib.text.corpus import Corpus -from orangecontrib.text.widgets.owcorpusviewer import OWCorpusViewer +from orangecontrib.text.preprocess import BASE_TOKENIZER +from orangecontrib.text.widgets.owcorpusviewer import ( + OWCorpusViewer, + DocumentListModel, + DocumentsFilterProxyModel, +) + + +class TestDocumentListModel(TestCase): + def test_empty(self): + model = DocumentListModel() + self.assertEqual(model.rowCount(), 0) + self.assertListEqual(model.get_filter_content(), []) + + def test_data(self): + model = DocumentListModel() + documents = ["Doc 1", "Doc 2", "Doc 3"] + contents = ["bar", "foo", "bar foo"] + model.setup_data(documents, contents) + + self.assertListEqual(model.get_filter_content(), contents) + self.assertEqual(model.rowCount(), 3) + + self.assertEqual(model.data(model.index(0)), documents[0]) + self.assertEqual(model.data(model.index(1)), documents[1]) + self.assertEqual(model.data(model.index(2)), documents[2]) + + def test_data_method(self): + model = DocumentListModel() + documents = ["Doc 1", "Doc 2", "Doc 3"] + contents = ["bar", "foo", "bar foo"] + model.setup_data(documents, contents) + + self.assertEqual(model.data(model.index(0), Qt.DisplayRole), documents[0]) + self.assertEqual(model.data(model.index(1), Qt.DisplayRole), documents[1]) + self.assertEqual(model.data(model.index(2), Qt.DisplayRole), documents[2]) + + self.assertEqual(model.data(model.index(0), Qt.UserRole), contents[0]) + self.assertEqual(model.data(model.index(1), Qt.UserRole), contents[1]) + self.assertEqual(model.data(model.index(2), Qt.UserRole), contents[2]) + + self.assertIsNone(model.data(model.index(2), Qt.BackgroundRole)) + + def test_update_filter_content(self): + model = DocumentListModel() + documents = ["Doc 1", "Doc 2", "Doc 3"] + contents = ["bar", "foo", "bar foo"] + model.setup_data(documents, contents) + + model.update_filter_content(["a", "b", "c"]) + self.assertEqual(model.data(model.index(0), Qt.UserRole), "a") + self.assertEqual(model.data(model.index(1), Qt.UserRole), "b") + self.assertEqual(model.data(model.index(2), Qt.UserRole), "c") + + with self.assertRaises(AssertionError): + model.update_filter_content( + [ + "a", + "b", + ] + ) + + +class TestFilterModel(TestCase): + def test_filter_model(self): + model = DocumentListModel() + filter_model = DocumentsFilterProxyModel() + filter_model.setSourceModel(model) + documents = ["Doc 1", "Doc 2", "Doc 3"] + contents = ["bar", "foo", "bar foo"] + model.setup_data(documents, contents) + + # __regex is None - all data shown + self.assertEqual(filter_model.rowCount(), 3) + self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0]) + self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[1]) + self.assertEqual(filter_model.data(filter_model.index(2, 0)), documents[2]) + + # with regex set + filter_model.set_filter_string("bar") + self.assertEqual(filter_model.rowCount(), 2) + self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0]) + self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[2]) + + def test_empty_model(self): + model = DocumentListModel() + filter_model = DocumentsFilterProxyModel() + filter_model.setSourceModel(model) + self.assertEqual(filter_model.rowCount(), 0) class TestCorpusViewerWidget(WidgetTest): def setUp(self): self.widget = self.create_widget(OWCorpusViewer) - self.corpus = Corpus.from_file('deerwester') + self.corpus = Corpus.from_file("deerwester") def test_data(self): self.send_signal(self.widget.Inputs.corpus, self.corpus) @@ -31,23 +123,23 @@ def test_search(self): self.process_events() out_corpus = self.get_output(self.widget.Outputs.matching_docs) self.assertEqual(len(out_corpus), 1) - self.assertEqual(self.widget.matches, 7) + self.assertEqual(self.widget.n_matches, 7) # first document is selected, when filter with word that is not in - # selected document out_corpus is None + # selected document, first of shown documents is selected self.widget.regexp_filter = "graph" self.widget.refresh_search() self.process_events() - out_corpus = self.get_output(self.widget.Outputs.matching_docs) - self.assertIsNone(out_corpus) + self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs))) # word count doesn't depend on selection - self.assertEqual(self.widget.matches, 7) + self.assertEqual(self.widget.n_matches, 7) # when filter is removed, matched words is 0 self.widget.regexp_filter = "" self.widget.refresh_search() self.process_events() - self.assertEqual(self.widget.matches, 0) + self.wait_until_finished() + self.assertEqual(self.widget.n_matches, 0) def test_highlighting(self): self.send_signal(self.widget.Inputs.corpus, self.corpus) @@ -68,15 +160,11 @@ def test_highlighting(self): self.assertIn('', html) def test_highlighting_non_latin(self): - documents = [ - { - 'content': """царстве есть сад с молодильными яблоками""" - } - ] + documents = [{"content": """царстве есть сад с молодильными яблоками"""}] metas = [ - (StringVariable('content'), lambda doc: doc.get('content')), + (StringVariable("content"), lambda doc: doc.get("content")), ] - dataset_name = 'RussianDocument' + dataset_name = "RussianDocument" corpus = Corpus.from_documents(documents, dataset_name, metas=metas) self.send_signal(self.widget.Inputs.corpus, corpus) @@ -90,56 +178,54 @@ def test_highlighting_non_latin(self): self.assertIn('', html) def test_output(self): - """ Output is intersection between selection and filter """ + """Output is intersection between selection and filter""" self.send_signal(self.widget.Inputs.corpus, self.corpus) self.widget.regexp_filter = "graph" self.widget.refresh_search() self.process_events() - self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs)) + # when intersection is empty automatically select first document shown + mathing = self.get_output(self.widget.Outputs.matching_docs) + self.assertEqual(1, len(mathing)) self.assertEqual( - 9, len(self.get_output(self.widget.Outputs.other_docs)) + mathing.get_column_view("Text")[0][0], + "The generation of random binary unordered trees", ) + self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs))) self.assertEqual( len(self.corpus.domain.metas) + 1, - len(self.get_output(self.widget.Outputs.corpus).domain.metas) + len(self.get_output(self.widget.Outputs.corpus).domain.metas), ) self.widget.doc_list.selectAll() # selects current documents in list - self.assertEqual( - 4, len(self.get_output(self.widget.Outputs.matching_docs)) - ) - self.assertEqual( - 5, len(self.get_output(self.widget.Outputs.other_docs)) - ) + self.assertEqual(4, len(self.get_output(self.widget.Outputs.matching_docs))) + self.assertEqual(5, len(self.get_output(self.widget.Outputs.other_docs))) output = self.get_output(self.widget.Outputs.corpus) self.assertEqual( len(self.get_output(self.widget.Outputs.matching_docs)), - sum(output.get_column_view("Selected")[0]) + sum(output.get_column_view("Selected")[0]), ) self.widget.regexp_filter = "human" self.widget.refresh_search() self.process_events() - # empty because none of matching documents is selected - self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs)) + # when intersection is empty automatically select first document shown + mathing = self.get_output(self.widget.Outputs.matching_docs) + self.assertEqual(1, len(mathing)) self.assertEqual( - 9, len(self.get_output(self.widget.Outputs.other_docs)) + mathing.get_column_view("Text")[0][0], + "Human machine interface for lab abc computer applications", ) + self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs))) output = self.get_output(self.widget.Outputs.corpus) - self.assertEqual(0, - sum(output.get_column_view("Selected")[0])) + self.assertEqual(1, sum(output.get_column_view("Selected")[0])) self.widget.doc_list.selectAll() - self.assertEqual( - 5, len(self.get_output(self.widget.Outputs.matching_docs)) - ) - self.assertEqual( - 4, len(self.get_output(self.widget.Outputs.other_docs)) - ) + self.assertEqual(5, len(self.get_output(self.widget.Outputs.matching_docs))) + self.assertEqual(4, len(self.get_output(self.widget.Outputs.other_docs))) output = self.get_output(self.widget.Outputs.corpus) self.assertEqual( len(self.get_output(self.widget.Outputs.matching_docs)), - sum(output.get_column_view("Selected")[0]) + sum(output.get_column_view("Selected")[0]), ) self.send_signal(self.widget.Inputs.corpus, None) @@ -149,7 +235,7 @@ def test_output(self): def test_empty_corpus(self): self.send_signal(self.widget.Inputs.corpus, self.corpus[:0]) - self.assertListEqual(self.widget.selected_documents, []) + self.assertSetEqual(self.widget.selected_documents, set()) self.assertEqual(self.widget.doc_list.model().rowCount(), 0) def test_report(self): @@ -159,6 +245,120 @@ def test_report(self): self.process_events() self.widget.send_report() + def test_filter_attributes(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.widget.filter_input.setText("graph") + self.widget.refresh_search() + + # all attributes used for filtering (shown documents with "graph" in Category) + doc_model = self.widget.doc_list.model() + doc_shown = [ + doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount()) + ] + self.assertListEqual( + doc_shown, ["Document 6", "Document 7", "Document 8", "Document 9"] + ) + + # only "Text" used for filtering (shown documents with "graph" in Text) + slv = self.widget.search_listbox + slv.selectionModel().select( + slv.model().index(1), QItemSelectionModel.ClearAndSelect + ) + doc_shown = [ + doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount()) + ] + self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"]) + + def test_filters_restored_from_context(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.widget.filter_input.setText("graph") + self.widget.refresh_search() + slv = self.widget.search_listbox + slv.selectionModel().select( + slv.model().index(1), QItemSelectionModel.ClearAndSelect + ) + self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]]) + + # send some other data to change values + temp_corpus = Corpus.from_file("andersen") + self.send_signal(self.widget.Inputs.corpus, temp_corpus) + self.assertListEqual(self.widget.search_features, list(temp_corpus.domain)) + + # test if corpus correctly restored for search_features + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]]) + self.assertEqual(self.widget.regexp_filter, "graph") + + # check if restored values correctly used for filtering + # filter_conent must include only values from the text column + self.assertListEqual( + self.widget.doc_list_model.get_filter_content(), + self.corpus.get_column_view("Text")[0].tolist(), + ) + # only "Text" used for filtering (shown documents with "graph" in Text) + doc_model = self.widget.doc_list.model() + doc_shown = [ + doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount()) + ] + self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"]) + + def test_data_only_hidden_attributes(self): + for a in self.corpus.domain: + a.attributes["hidden"] = True + self.send_signal(self.widget.Inputs.corpus, self.corpus) + # documents are shown but filter does not work + self.assertEqual(self.widget.doc_list_model.rowCount(), 9) + + def test_token_checkbox(self): + corpus_tokens = BASE_TOKENIZER(self.corpus) + self.send_signal(self.widget.Inputs.corpus, corpus_tokens) + self.assertTrue(self.widget.show_tokens_checkbox.isEnabled()) + self.assertFalse(self.widget.show_tokens_checkbox.isChecked()) + + self.widget.show_tokens_checkbox.setChecked(True) + self.assertTrue(self.widget.show_tokens_checkbox.isChecked()) + + # if corpus without tokens on the input button is dissabled and unchecked + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertFalse(self.widget.show_tokens_checkbox.isChecked()) + self.assertFalse(self.widget.show_tokens_checkbox.isEnabled()) + + def test_image(self): + im_attr = StringVariable("Image") + im_attr.attributes["origin"] = "/path/to/image" + im_attr.attributes["type"] = "image" + domain = self.corpus.domain + im_corpus = self.corpus.transform( + Domain(domain.attributes, metas=domain.metas + (im_attr,)) + ) + with im_corpus.unlocked(im_corpus.metas): + im_corpus[:, im_attr] = np.array(["image_name"] + [""] * 8).reshape(-1, 1) + self.send_signal(self.widget.Inputs.corpus, im_corpus) + # tried to get content from the view to test correctness and cannot find + # a nice way also patching does not work on all systems, just testing + # that having image in corpus does not fail + + def test_migrate_settings(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + packed_data = self.widget.settingsHandler.pack_data(self.widget) + context = packed_data["context_settings"][0] + # we borrow display_features from setting extracted from widget, it + # contains Category and Text + context.values["display_indices"] = [0] + context.values["search_indices"] = [1] + context.values["__version__"] = 1 + context.attributes = tuple(context.attributes.items()) + context.attributes = context.attributes + self.widget = self.create_widget( + OWCorpusViewer, + stored_settings={"context_settings": [context], "__version__": 1}, + ) + self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=self.widget) + domain = self.corpus.domain + self.assertListEqual(self.widget.display_features, [domain["Category"]]) + self.assertListEqual(self.widget.search_features, [domain["Text"]]) + + if __name__ == "__main__": unittest.main()