From 2f6d3f92270d6c5c6a8015c81191e4587f6aea48 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 16 Sep 2022 16:04:41 +0200
Subject: [PATCH] Speedup corpus viewer
---
orangecontrib/text/widgets/owcorpusviewer.py | 819 +++++++++++-------
.../text/widgets/tests/test_owcorpusviewer.py | 280 +++++-
2 files changed, 760 insertions(+), 339 deletions(-)
diff --git a/orangecontrib/text/widgets/owcorpusviewer.py b/orangecontrib/text/widgets/owcorpusviewer.py
index 44c56fd24..8ecae15f4 100644
--- a/orangecontrib/text/widgets/owcorpusviewer.py
+++ b/orangecontrib/text/widgets/owcorpusviewer.py
@@ -1,27 +1,291 @@
import os
import re
import sre_constants
-from itertools import chain
-from typing import Set
+from typing import Any, Iterable, List, Set
+import numpy as np
from AnyQt.QtCore import (
- Qt, QUrl, QItemSelection, QItemSelectionModel, QItemSelectionRange
+ QAbstractListModel,
+ QEvent,
+ QItemSelection,
+ QItemSelectionModel,
+ QItemSelectionRange,
+ QModelIndex,
+ QSortFilterProxyModel,
+ Qt,
+ QUrl,
)
-
-from AnyQt.QtGui import QStandardItemModel, QStandardItem
-from AnyQt.QtWidgets import (QListView, QSizePolicy, QTableView,
- QAbstractItemView, QHeaderView, QSplitter,
- QApplication)
-
-from Orange.data.domain import filter_visible
+from AnyQt.QtWidgets import (
+ QAbstractItemView,
+ QApplication,
+ QHeaderView,
+ QListView,
+ QSizePolicy,
+ QSplitter,
+ QTableView,
+)
+from Orange.data import Variable
+from Orange.data.domain import Domain, filter_visible
from Orange.widgets import gui
-from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
+from Orange.widgets.settings import ContextSetting, Setting, DomainContextHandler
from Orange.widgets.utils.annotated_data import create_annotated_table
-from Orange.widgets.widget import OWWidget, Msg, Input, Output
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.itemmodels import DomainModel
+from Orange.widgets.widget import Input, Msg, Output, OWWidget
+from orangecanvas.gui.utils import disconnected
+from orangewidget.utils.listview import ListViewSearch
+
from orangecontrib.text.corpus import Corpus
+HTML = """
+
+
+
+
+
+
+
+
+
+
+{}
+
+
+"""
+SEPARATOR = (
+ ' | |
| |
'
+)
+
-class OWCorpusViewer(OWWidget):
+def _count_matches(content: List[str], search_string: str, state: TaskState) -> int:
+    """
+    Count occurrences of any of the search terms in the content texts.
+
+    Parameters
+    ----------
+    content
+        List of texts in which occurrences are counted.
+    search_string
+        Regular expression searched in the texts (case-insensitive). It
+        usually has the format term1|term2|term3|...; leading and trailing
+        "|" characters are ignored.
+    state
+        Task state; its progress value (in percent) is set after each text.
+
+    Returns
+    -------
+    Number of all matches of search_string in all texts in content list;
+    0 when nothing is left to search for after stripping "|".
+
+    Raises
+    ------
+    re.error
+        If the stripped search_string is not a valid regular expression.
+    """
+    # Strip before testing for emptiness: a string made only of "|" would
+    # otherwise compile to the empty pattern, which matches at every position
+    # of every text and produces a meaningless count.
+    pattern = search_string.strip("|") if search_string else ""
+    if not pattern:
+        return 0
+    regex = re.compile(pattern, re.IGNORECASE)
+    n_texts = len(content)
+    matches = 0
+    for i, text in enumerate(content, start=1):
+        matches += len(regex.findall(text))
+        state.set_progress_value(i / n_texts * 100)
+    return matches
+
+
+class DocumentListModel(QAbstractListModel):
+    """
+    Custom model for listing documents. Using custom model since Orange's
+    pylistmodel is too slow for large number of documents.
+
+    Every row holds two parallel values: the one returned for
+    Qt.DisplayRole and the filter content returned for Qt.UserRole.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__visible_data = []
+        self.__filter_content = []
+
+    def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
+        """
+        Return the displayed value (Qt.DisplayRole) or the filter content
+        (Qt.UserRole) of the row; None for any other role or for an index
+        that does not point to an existing row.
+        """
+        row = index.row()
+        # An invalid index reports row -1; without this guard Python's
+        # negative indexing would silently return the last document.
+        if not index.isValid() or not 0 <= row < len(self.__visible_data):
+            return None
+        if role == Qt.DisplayRole:
+            return self.__visible_data[row]
+        if role == Qt.UserRole:
+            return self.__filter_content[row]
+        return None
+
+    def rowCount(self, parent: QModelIndex = None, *args, **kwargs) -> int:
+        """Return the number of documents; items of a list have no children."""
+        if parent is not None and parent.isValid():
+            return 0
+        return len(self.__visible_data)
+
+    def setup_data(self, data: List[str], content: List[str]):
+        """Replace displayed values and filter content and reset the model."""
+        self.beginResetModel()
+        self.__visible_data = data
+        self.__filter_content = content
+        self.endResetModel()
+
+    def update_filter_content(self, content: List[str]):
+        """
+        Replace only the filter content. No model signal is emitted here, so
+        an attached filter has to be invalidated by the caller.
+        """
+        # kept as an assert: the test suite expects AssertionError on a
+        # length mismatch
+        assert len(content) == len(self.__visible_data)
+        self.__filter_content = content
+
+    def get_filter_content(self) -> List[str]:
+        """Return the list of filter contents (not a copy)."""
+        return self.__filter_content
+
+    def clear(self):
+        """Remove all documents and reset the model."""
+        self.beginResetModel()
+        self.__visible_data = []
+        self.__filter_content = []
+        self.endResetModel()
+
+
+class DocumentsFilterProxyModel(QSortFilterProxyModel):
+ """
+ Filter model for documents list.
+
+ A row is accepted when the value stored under Qt.UserRole in the source
+ model matches the current case-insensitive regular expression.
+ """
+
+ # compiled filter pattern; stays None until set_filter_string is called
+ __regex = None
+
+ def set_filter_string(self, filter_string: str):
+ """
+ Compile filter_string (leading and trailing "|" stripped) as a
+ case-insensitive regular expression and re-apply the filter.
+ """
+ # NOTE(review): re.compile raises re.error for an invalid pattern and
+ # nothing here catches it - confirm how callers deal with invalid
+ # user input
+ self.__regex = re.compile(filter_string.strip("|"), re.IGNORECASE)
+ self.invalidateFilter()
+
+ def filterAcceptsRow(self, source_row: int, source_parent: QModelIndex) -> bool:
+ """Accept only documents whose filter content matches the filter string"""
+ if self.__regex is None:
+ # filter is not defined yet - show all
+ return True
+ else:
+ # the text to search in is stored under Qt.UserRole in the source model
+ index = self.sourceModel().index(source_row, 0, source_parent)
+ content = self.sourceModel().data(index, Qt.UserRole)
+ res = self.__regex.search(content)
+ return bool(res)
+
+
+class DocumentTableView(QTableView):
+ """TableView that disables unselecting all items"""
+
+ def selectionCommand(
+ self, index: QModelIndex, event: QEvent = None
+ ) -> QItemSelectionModel.SelectionFlags:
+ """
+ Return selection flags for the index and event. The flags computed by
+ the base class are used, except that NoUpdate is returned when the
+ index is invalid or when it is the only selected index, and toggling
+ while moving the mouse is turned into selecting.
+ """
+ flags = super().selectionCommand(index, event)
+ selmodel = self.selectionModel()
+ if not index.isValid(): # Click on empty viewport; don't clear
+ return QItemSelectionModel.NoUpdate
+ if selmodel.isSelected(index):
+ currsel = selmodel.selectedIndexes()
+ if len(currsel) == 1 and index == currsel[0]:
+ # Is the last selected index; do not deselect it
+ return QItemSelectionModel.NoUpdate
+ if (
+ event is not None
+ and event.type() == QEvent.MouseMove
+ and flags & QItemSelectionModel.ToggleCurrent
+ ):
+ # Disable ctrl drag 'toggle'; can be made to deselect the last
+ # index, would need to keep track of the current selection
+ # (selectionModel does this but does not expose it)
+ flags &= ~QItemSelectionModel.Toggle
+ flags |= QItemSelectionModel.Select
+ return flags
+
+
+class VariableListViewSearch(ListViewSearch):
+ """ListViewSearch that disables unselecting all items in the list"""
+
+ def selectionCommand(
+ self, index: QModelIndex, event: QEvent = None
+ ) -> QItemSelectionModel.SelectionFlags:
+ """
+ Return selection flags for the index and event. The flags computed by
+ the base class are used, except that NoUpdate is returned when the
+ index is invalid or when it is the only selected index, and toggling
+ while moving the mouse is turned into selecting.
+ """
+ flags = super().selectionCommand(index, event)
+ selmodel = self.selectionModel()
+ if not index.isValid(): # Click on empty viewport; don't clear
+ return QItemSelectionModel.NoUpdate
+ if selmodel.isSelected(index):
+ currsel = selmodel.selectedIndexes()
+ if len(currsel) == 1 and index == currsel[0]:
+ # Is the last selected index; do not deselect it
+ return QItemSelectionModel.NoUpdate
+ if (
+ event is not None
+ and event.type() == QEvent.MouseMove
+ and flags & QItemSelectionModel.ToggleCurrent
+ ):
+ # Disable ctrl drag 'toggle'; can be made to deselect the last
+ # index, would need to keep track of the current selection
+ # (selectionModel does this but does not expose it)
+ flags &= ~QItemSelectionModel.Toggle
+ flags |= QItemSelectionModel.Select
+ return flags
+
+ def set_selection(self, items: Iterable[Variable]):
+ """
+ Set selected items in the list view. Items that are not in the
+ view's model are ignored.
+ """
+ model = self.model()
+ values = self.model()[:]
+ # keep only items that the model contains
+ items = [it for it in items if it in values]
+ selection = QItemSelection()
+ if items:
+ for val in items:
+ # position of the item in the model is its row in the view
+ index = values.index(val)
+ selection.merge(
+ QItemSelection(model.index(index, 0), model.index(index, 0)),
+ QItemSelectionModel.Select,
+ )
+ self.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)
+
+
+class VisibleDomainModel(DomainModel):
+ """Domain model that keeps only visible features"""
+
+ def set_domain(self, domain):
+ """
+ Set the domain of the model. When domain is not None, a new Domain is
+ built from its attributes, class_vars and metas, each passed through
+ filter_visible; None is passed on unchanged.
+ """
+ if domain is not None:
+ domain = Domain(
+ filter_visible(domain.attributes),
+ class_vars=filter_visible(domain.class_vars),
+ metas=filter_visible(domain.metas),
+ )
+ super().set_domain(domain)
+
+
+class OWCorpusViewer(OWWidget, ConcurrentWidgetMixin):
name = "Corpus Viewer"
description = "Display corpus contents."
icon = "icons/CorpusViewer.svg"
@@ -35,74 +299,78 @@ class Outputs:
other_docs = Output("Other Docs", Corpus)
corpus = Output("Corpus", Corpus)
- settingsHandler = PerfectDomainContextHandler(
- match_values = PerfectDomainContextHandler.MATCH_VALUES_ALL
- )
+ settingsHandler = DomainContextHandler()
- search_indices = ContextSetting([], exclude_metas=False) # features included in search
- display_indices = ContextSetting([], exclude_metas=False) # features for display
- display_features = ContextSetting([], exclude_metas=False)
- selected_documents = ContextSetting([])
+ settings_version = 2
+ search_features: List[Variable] = ContextSetting([])
+ display_features: List[Variable] = ContextSetting([])
+ selected_documents: Set[int] = Setting({0}, schema_only=True)
regexp_filter = ContextSetting("")
-
show_tokens = Setting(False)
autocommit = Setting(True)
class Warning(OWWidget.Warning):
- no_feats_search = Msg('No features included in search.')
- no_feats_display = Msg('No features selected for display.')
+ no_feats_search = Msg("No features included in search.")
+ no_feats_display = Msg("No features selected for display.")
def __init__(self):
super().__init__()
+ ConcurrentWidgetMixin.__init__(self)
- self.corpus = None # Corpus
- self.corpus_docs = None # Documents generated from Corpus
- self.doc_webview = None # WebView for showing content
- self.search_features = [] # two copies are needed since Display allows drag & drop
- self.display_list_indices = [0]
- self.matches = 0 # Matches of the query
+ self.corpus = None # Corpus
+ self.__pending_selected_documents = self.selected_documents
# Info attributes
self.update_info()
- info_box = gui.widgetBox(self.controlArea, 'Info')
- gui.label(info_box, self, 'Tokens: %(n_tokens)s')
- gui.label(info_box, self, 'Types: %(n_types)s')
- gui.label(info_box, self, 'Matching documents: %(n_matching)s')
- gui.label(info_box, self, 'Matches: %(n_matches)s')
+ info_box = gui.widgetBox(self.controlArea, "Info")
+ gui.label(info_box, self, "Tokens: %(n_tokens)s")
+ gui.label(info_box, self, "Types: %(n_types)s")
+ gui.label(info_box, self, "Matching documents: %(n_matching)s")
+ gui.label(info_box, self, "Matches: %(n_matches)s")
# Search features
- self.search_listbox = gui.listBox(
- self.controlArea, self, 'search_indices', 'search_features',
- selectionMode=QListView.ExtendedSelection,
- box='Search features', callback=self.search_features_changed)
+ ex_sel = QListView.ExtendedSelection
+ search_box = gui.widgetBox(self.controlArea, "Search features")
+ self.search_listbox = sl = VariableListViewSearch(selectionMode=ex_sel)
+ search_box.layout().addWidget(sl)
+ sl.setModel(VisibleDomainModel(separators=False))
+ sl.selectionModel().selectionChanged.connect(self.search_features_changed)
# Display features
- display_box = gui.widgetBox(self.controlArea, 'Display features')
- self.display_listbox = gui.listBox(
- display_box, self, 'display_list_indices', 'display_features',
- selectionMode=QListView.ExtendedSelection,
- callback=self.show_docs, enableDragDrop=True)
- self.show_tokens_checkbox = gui.checkBox(display_box, self, 'show_tokens',
- 'Show Tokens && Tags', callback=self.show_docs)
+ display_box = gui.widgetBox(self.controlArea, "Display features")
+ self.display_listbox = dl = VariableListViewSearch(selectionMode=ex_sel)
+ display_box.layout().addWidget(dl)
+ dl.setModel(VisibleDomainModel(separators=False))
+ dl.selectionModel().selectionChanged.connect(self.display_features_changed)
+
+ self.show_tokens_checkbox = gui.checkBox(
+ display_box,
+ self,
+ "show_tokens",
+ "Show Tokens && Tags",
+ callback=self.show_docs,
+ )
# Auto-commit box
- gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data', 'Auto send is on')
+ gui.auto_commit(
+ self.controlArea, self, "autocommit", "Send data", "Auto send is on"
+ )
# Search
- self.filter_input = gui.lineEdit(self.mainArea, self, 'regexp_filter',
- orientation=Qt.Horizontal,
- sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
- QSizePolicy.Fixed),
- label='RegExp Filter:',
- callback=self.refresh_search)
-
- # Main area
- self.splitter = QSplitter(
+ self.filter_input = gui.lineEdit(
+ self.mainArea,
+ self,
+ "regexp_filter",
orientation=Qt.Horizontal,
- childrenCollapsible=False,
+ sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed),
+ label="RegExp Filter:",
+ callback=self.refresh_search,
)
+
+ # Main area
+ self.splitter = QSplitter(orientation=Qt.Horizontal, childrenCollapsible=False)
# Document list
- self.doc_list = QTableView()
+ self.doc_list = DocumentTableView()
self.doc_list.setSelectionBehavior(QTableView.SelectRows)
self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
@@ -110,11 +378,11 @@ def __init__(self):
self.doc_list.horizontalHeader().setVisible(False)
self.splitter.addWidget(self.doc_list)
- self.doc_list_model = QStandardItemModel(self)
- self.doc_list.setModel(self.doc_list_model)
- self.doc_list.selectionModel().selectionChanged.connect(
- self.selection_changed
- )
+ self.doc_list_model = DocumentListModel()
+ proxy_model = DocumentsFilterProxyModel()
+ proxy_model.setSourceModel(self.doc_list_model)
+ self.doc_list.setModel(proxy_model)
+ self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed)
# Document contents
self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
@@ -129,23 +397,11 @@ def set_data(self, corpus=None):
self.closeContext()
self.reset_widget()
self.corpus = corpus
- self.search_features = []
if corpus is not None:
- domain = self.corpus.domain
- # Enable/disable tokens checkbox
- if not self.corpus.has_tokens():
- self.show_tokens_checkbox.setCheckState(Qt.Unchecked)
- self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())
-
- self.search_features = list(filter_visible(chain(domain.variables, domain.metas)))
- self.display_features = list(filter_visible(chain(domain.variables, domain.metas)))
- self.search_indices = list(range(len(self.search_features)))
- self.display_indices = list(range(len(self.display_features)))
- self.selected_documents = [corpus.titles[0]] if \
- corpus.titles is not None and len(corpus.titles) else []
+ self.setup_controls()
self.openContext(self.corpus)
- self.display_list_indices = self.display_indices
- self.regenerate_docs()
+ self.doc_list.model().set_filter_string(self.regexp_filter)
+ self.select_variables()
self.list_docs()
self.update_info()
self.set_selection()
@@ -155,219 +411,138 @@ def set_data(self, corpus=None):
def reset_widget(self):
# Corpus
self.corpus = None
- self.corpus_docs = None
- self.display_features = []
# Widgets
- self.search_listbox.clear()
- self.display_listbox.clear()
+ self.search_listbox.model().set_domain(None)
+ self.display_listbox.model().set_domain(None)
self.filter_input.clear()
self.update_info()
# Models/vars
- self.search_features.clear()
- self.search_indices.clear()
- self.display_indices.clear()
self.doc_list_model.clear()
# Warnings
self.Warning.clear()
# WebView
- self.doc_webview.setHtml('')
+ self.doc_webview.setHtml("")
+
+ def setup_controls(self):
+ """
+ Setup controls in control area: the tokens checkbox and both feature
+ lists. All features listed in the models become search and display
+ features.
+ """
+ domain = self.corpus.domain
+ # tokens can be shown only when the corpus has tokens
+ if not self.corpus.has_tokens():
+ self.show_tokens_checkbox.setCheckState(Qt.Unchecked)
+ self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())
+ self.search_listbox.model().set_domain(domain)
+ self.display_listbox.model().set_domain(domain)
+ self.search_features = self.search_listbox.model()[:]
+ self.display_features = self.display_listbox.model()[:]
+
+ def select_variables(self):
+ """
+ Set selection in the search and display feature lists according to
+ search_features and display_features. Features missing from the
+ models are dropped first; when none remain all are selected.
+ """
+ smodel = self.search_listbox.model()
+ dmodel = self.display_listbox.model()
+ # it can happen that domain handler will set some features that are
+ # not part of domain - remove them
+ self.search_features = [f for f in self.search_features if f in smodel]
+ self.display_features = [f for f in self.display_features if f in dmodel]
+ # if no features after removing non-existent, select all - default
+ if not self.search_features:
+ self.search_features = smodel[:]
+ if not self.display_features:
+ self.display_features = dmodel[:]
+ # selectionChanged handlers are disconnected while the selection is set
+ with disconnected(
+ self.search_listbox.selectionModel().selectionChanged,
+ self.search_features_changed,
+ ):
+ self.search_listbox.set_selection(self.search_features)
+ with disconnected(
+ self.display_listbox.selectionModel().selectionChanged,
+ self.display_features_changed,
+ ):
+ self.display_listbox.set_selection(self.display_features)
def list_docs(self):
- """ List documents into the left scrolling area """
- if self.corpus_docs is None:
- return
- # TODO: remove search_keyword??
- search_keyword = self.regexp_filter.strip('|')
- matches = 0
- try:
- reg = re.compile(search_keyword, re.IGNORECASE)
- except sre_constants.error:
- return
-
- self.doc_list_model.clear()
+ """List documents into the left scrolling area"""
+ docs = self.regenerate_docs()
+ self.doc_list_model.setup_data(self.corpus.titles.tolist(), docs)
- for i, (doc, title, content) in enumerate(zip(self.corpus, self.corpus.titles,
- self.corpus_docs)):
- res = len(list(reg.finditer(content))) if self.regexp_filter else 0
- if not self.regexp_filter or res:
- matches += res
- item = QStandardItem()
- item.setData(str(title), Qt.DisplayRole)
- item.setData(doc, Qt.UserRole)
- self.doc_list_model.appendRow(item)
- self.matches = matches
-
- def get_selected_documents_from_view(self) -> Set[str]:
- """
- Returns
- -------
- Set with names of selected documents in the QTableView
- """
- return {
- i.data(Qt.DisplayRole)
- for i in self.doc_list.selectionModel().selectedRows()
- }
+ def get_selected_indexes(self) -> Set[int]:
+ """
+ Return rows of documents selected in the view. Rows are mapped from
+ the view's (proxy) model to its source model.
+ """
+ m = self.doc_list.model().mapToSource
+ return {m(i).row() for i in self.doc_list.selectionModel().selectedRows()}
def set_selection(self) -> None:
"""
Select documents in selected_documents attribute in the view
"""
+ self.selected_documents = self.__pending_selected_documents
+ self.__pending_selected_documents = {0}
view = self.doc_list
model = view.model()
+ source_model = model.sourceModel()
- previously_selected = self.selected_documents.copy()
selection = QItemSelection()
- for row in range(model.rowCount()):
- document = model.data(model.index(row, 0), Qt.DisplayRole)
- if document in self.selected_documents:
- selection.append(QItemSelectionRange(
- view.model().index(row, 0),
- view.model().index(row, 0)
- ))
- view.selectionModel().select(
- selection, QItemSelectionModel.ClearAndSelect
- )
- if len(selection) == 0:
- # in cases when selection is empty qt's selection_changed is not
- # called and so we need to manually trigger show_docs
- self.show_docs()
- # select emmit selection change signal which causes calling
- # selection_changed when filtering it means that documents which
- # are currently filtered out get removed from self.selected_douments
- # we still want to keep them to be still selected after user removes
- # filter
- self.selected_documents = previously_selected
+ self.selected_documents = {
+ r for r in self.selected_documents if r < len(self.corpus)
+ }
+ for row in self.selected_documents:
+ index = model.mapFromSource(source_model.index(row, 0))
+ selection.append(QItemSelectionRange(index, index))
+ # don't emit selection change to avoid double call of commit function
+ # it is already called from set_data
+ with disconnected(
+ self.doc_list.selectionModel().selectionChanged, self.selection_changed
+ ):
+ view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)
def selection_changed(self) -> None:
- """
- Function is called every time the selection changes - when user select
- new range of documents
- """
- self.selected_documents = self.get_selected_documents_from_view()
+ """Function is called every time the selection changes"""
+ self.selected_documents = self.get_selected_indexes()
self.show_docs()
self.commit.deferred()
def show_docs(self):
- """ Show the selected documents in the right area """
- HTML = '''
-
-
-
-
-
-
-
-
-
-
- {}
-
-
- '''
- self.display_indices = self.display_list_indices
+ """Show the selected documents in the right area"""
if self.corpus is None:
return
self.Warning.no_feats_display.clear()
- if len(self.display_indices) == 0:
+ if len(self.display_features) == 0:
self.Warning.no_feats_display()
if self.show_tokens:
tokens = list(self.corpus.ngrams_iterator(include_postags=True))
- marked_search_features = [f for i, f in enumerate(self.search_features)
- if i in self.search_indices]
-
- html = ''
- for doc_count, index in enumerate(self.doc_list.selectionModel().selectedRows()):
- if doc_count > 0: # add split
- html += ' | |
' \
- ' | |
'
-
- row_ind = index.data(Qt.UserRole).row_index
- for ind in self.display_indices:
- feature = self.display_features[ind]
- value = str(index.data(Qt.UserRole)[feature.name])
- if feature in marked_search_features:
+ parts = []
+ for doc_count, c_index in enumerate(sorted(self.selected_documents)):
+ text = ""
+ for feature in self.display_features:
+ value = str(self.corpus[c_index, feature.name])
+ if feature in self.search_features:
value = self.__mark_text(value)
- value = value.replace('\n', '
')
- is_image = feature.attributes.get('type', '') == 'image'
- if is_image and value != '?':
+ value = value.replace("\n", "
")
+ is_image = feature.attributes.get("type", "") == "image"
+ if is_image and value != "?":
value = os.path.join(feature.attributes.get("origin", ""), value)
value = ''.format(value)
- html += '{}: | ' \
- '{} |
'.format(
- feature.name, value)
+ text += (
+ f'{feature.name}: | '
+ f'{value} |
'
+ )
if self.show_tokens:
- html += 'Tokens & Tags: | ' \
- '{} |
'.format(''.join('{}'.format(
- token) for token in tokens[row_ind]))
-
- html += '
'
+ tokens_ = "".join(
+ f'{token}' for token in tokens[c_index]
+ )
+ text += (
+ f'Tokens & Tags: | '
+ f"{tokens_} |
"
+ )
+ parts.append(text)
+
+ joined = SEPARATOR.join(parts)
+ html = f""
base = QUrl.fromLocalFile(__file__)
self.doc_webview.setHtml(HTML.format(html), base)
def __mark_text(self, text):
- search_keyword = self.regexp_filter.strip('|')
+ search_keyword = self.regexp_filter.strip("|")
if not search_keyword:
return text
@@ -382,71 +557,91 @@ def __mark_text(self, text):
text = list(text)
for m in matches[::-1]:
- text[m.start():m.end()] = list('{}'\
- .format("".join(text[m.start():m.end()])))
-
+ text[m.start() : m.end()] = list(
+ f'{"".join(text[m.start(): m.end()])}'
+ )
return "".join(text)
+ @staticmethod
+ def __get_selected_rows(view: QListView) -> List[Variable]:
+ """Return items of the view's model in selected rows, ordered by row"""
+ rows = view.selectionModel().selectedRows()
+ values = view.model()[:]
+ return [values[row.row()] for row in sorted(rows, key=lambda r: r.row())]
+
def search_features_changed(self):
- self.regenerate_docs()
+ self.search_features = self.__get_selected_rows(self.search_listbox)
+ if self.corpus:
+ self.doc_list_model.update_filter_content(self.regenerate_docs())
+ self.doc_list.model().invalidateFilter()
self.refresh_search()
- def regenerate_docs(self):
- self.corpus_docs = None
+ def display_features_changed(self):
+ """Store features selected in the display list and show documents again"""
+ self.display_features = self.__get_selected_rows(self.display_listbox)
+ self.show_docs()
+
+ def regenerate_docs(self) -> List[str]:
self.Warning.no_feats_search.clear()
- if self.corpus is not None:
- feats = [self.search_features[i] for i in self.search_indices]
- if len(feats) == 0:
- self.Warning.no_feats_search()
- self.corpus_docs = self.corpus.documents_from_features(feats)
+ if len(self.search_features) == 0:
+ self.Warning.no_feats_search()
+ return self.corpus.documents_from_features(self.search_features)
def refresh_search(self):
if self.corpus is not None:
- self.list_docs()
- self.set_selection()
+ self.doc_list.model().set_filter_string(self.regexp_filter)
+ if not self.selected_documents:
+ # when currently selected items are filtered selection is empty
+ # select first element in the view in that case
+ self.doc_list.setCurrentIndex(self.doc_list.model().index(0, 0))
self.update_info()
+ self.start(
+ _count_matches,
+ self.doc_list_model.get_filter_content(),
+ self.regexp_filter,
+ )
+ self.show_docs()
self.commit.deferred()
+ def on_done(self, res: int):
+ """
+ When matches count is done store the result in n_matches;
+ "n/a" is stored when the result is None
+ """
+ self.n_matches = res if res is not None else "n/a"
+
+ def on_exception(self, ex):
+ """Re-raise the exception passed to the widget"""
+ raise ex
+
def update_info(self):
if self.corpus is not None:
- self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(), len(self.corpus))
- self.n_matches = self.matches if self.matches else 'n/a'
- self.n_tokens = sum(map(len, self.corpus.tokens)) if self.corpus.has_tokens() else 'n/a'
- self.n_types = len(self.corpus.dictionary) if self.corpus.has_tokens() else 'n/a'
+ has_tokens = self.corpus.has_tokens()
+ self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.corpus)}"
+ self.n_tokens = sum(map(len, self.corpus.tokens)) if has_tokens else "n/a"
+ self.n_types = len(self.corpus.dictionary) if has_tokens else "n/a"
else:
- self.n_matching = ''
- self.n_matches = ''
- self.n_tokens = ''
- self.n_types = ''
+ self.n_matching = "n/a"
+ self.n_matches = "n/a"
+ self.n_tokens = "n/a"
+ self.n_types = "n/a"
@gui.deferred
def commit(self):
matched = unmatched = annotated_corpus = None
- corpus = self.corpus
- if corpus is not None:
- # it returns a set of selected documents which are in view
- selected_docs = self.get_selected_documents_from_view()
- titles = corpus.titles
- matched_mask = [
- i for i, t in enumerate(titles) if t in selected_docs
- ]
- unmatched_mask = [
- i for i, t in enumerate(titles) if t not in selected_docs
- ]
-
- matched = corpus[matched_mask] if len(matched_mask) else None
- unmatched = corpus[unmatched_mask] if len(unmatched_mask) else None
- annotated_corpus = create_annotated_table(corpus, matched_mask)
+ if self.corpus is not None:
+ selected_docs = sorted(self.get_selected_indexes())
+ matched = self.corpus[selected_docs] if selected_docs else None
+ mask = np.ones(len(self.corpus), bool)
+ mask[selected_docs] = 0
+ unmatched = self.corpus[mask] if mask.any() else None
+ annotated_corpus = create_annotated_table(self.corpus, selected_docs)
self.Outputs.matching_docs.send(matched)
self.Outputs.other_docs.send(unmatched)
self.Outputs.corpus.send(annotated_corpus)
def send_report(self):
- self.report_items((
- ("Query", self.regexp_filter),
- ("Matching documents", self.n_matching),
- ("Matches", self.n_matches)
- ))
+ self.report_items(
+ (
+ ("Query", self.regexp_filter),
+ ("Matching documents", self.n_matching),
+ ("Matches", self.n_matches),
+ )
+ )
def showEvent(self, event):
super().showEvent(event)
@@ -460,18 +655,44 @@ def update_splitter(self):
"""
w1, w2 = self.splitter.sizes()
ws = w1 + w2
- if w2 < 2/3 * ws:
- self.splitter.setSizes([int(ws * 1/3), int(ws * 2/3)])
-
+ if w2 < 2 / 3 * ws:
+ self.splitter.setSizes([int(ws * 1 / 3), int(ws * 2 / 3)])
+
+ @classmethod
+ def migrate_context(cls, context, version):
+ """
+ Migrate contexts stored with settings version lower than 2: index
+ based display/search settings are replaced with lists of features
+ and the context's variable description is converted to dicts.
+ """
+ if version < 2:
+ f_order = context.values.pop("display_features", None)
+ display_idx = context.values.pop("display_indices", [])
+ search_ids = context.values.pop("search_indices", [])
+ if f_order is not None:
+ # the first element of the stored setting is the list of features
+ f_order = f_order[0]
+ # translate stored indices to features; indices out of range are skipped
+ display_features = [f_order[i] for i in display_idx if i < len(f_order)]
+ search_features = [f_order[i] for i in search_ids if i < len(f_order)]
+ # NOTE(review): meaning of -3 is not visible here - confirm
+ # against the context handler
+ context.values["display_features"] = (display_features, -3)
+ context.values["search_features"] = (search_features, -3)
+
+ # The old widget used PerfectDomainContextHandler with
+ # MATCH_VALUES_ALL, now it uses DomainContextHandler. The
+ # differences are:
+ # - the perfect handler stores values in tuples while the domain
+ # handler stores them in dicts
+ # - the domain context handler stores class_vars together with
+ # attributes while the perfect handler stores them separately
+ # - since MATCH_VALUES_ALL was used, discrete var values were stored
+ # with the var name (replacing them with the id for discrete var - 1)
+ if hasattr(context, "class_vars"):
+ context.attributes = {
+ attr: 1 if isinstance(v, list) else v
+ for attr, v in context.attributes + context.class_vars
+ }
+ context.metas = dict(context.metas)
+ delattr(context, "class_vars")
+
+
+if __name__ == "__main__":
+ from orangewidget.utils.widgetpreview import WidgetPreview
-if __name__ == '__main__':
from orangecontrib.text.preprocess import BASE_TOKENIZER
- from orangecontrib.text.tag.pos import AveragedPerceptronTagger
- from orangewidget.utils.widgetpreview import WidgetPreview
- corpus = Corpus.from_file('book-excerpts')
- corpus = corpus[:3]
- tagger = AveragedPerceptronTagger()
- tagged_corpus = tagger(BASE_TOKENIZER(corpus))
- tagged_corpus.ngram_range = (1, 2)
- WidgetPreview(OWCorpusViewer).run(tagged_corpus)
+ corpus_ = Corpus.from_file("book-excerpts")
+ corpus_ = corpus_[:3]
+ corpus_ = BASE_TOKENIZER(corpus_)
+ WidgetPreview(OWCorpusViewer).run(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
index 6d1847868..f19057885 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
@@ -1,18 +1,110 @@
import unittest
+from unittest import TestCase
import numpy as np
+from AnyQt.QtCore import QItemSelectionModel, Qt
from AnyQt.QtTest import QSignalSpy
+from orangewidget.settings import Context
+
+from Orange.data import StringVariable, Domain
from Orange.widgets.tests.base import WidgetTest
-from Orange.data import StringVariable
from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.widgets.owcorpusviewer import OWCorpusViewer
+from orangecontrib.text.preprocess import BASE_TOKENIZER
+from orangecontrib.text.widgets.owcorpusviewer import (
+ OWCorpusViewer,
+ DocumentListModel,
+ DocumentsFilterProxyModel,
+)
+
+
+class TestDocumentListModel(TestCase):
+ """Tests of DocumentListModel that do not need the widget"""
+
+ def test_empty(self):
+ """New model has no rows and no filter content"""
+ model = DocumentListModel()
+ self.assertEqual(model.rowCount(), 0)
+ self.assertListEqual(model.get_filter_content(), [])
+
+ def test_data(self):
+ """setup_data sets rows; data returns documents for the default role"""
+ model = DocumentListModel()
+ documents = ["Doc 1", "Doc 2", "Doc 3"]
+ contents = ["bar", "foo", "bar foo"]
+ model.setup_data(documents, contents)
+
+ self.assertListEqual(model.get_filter_content(), contents)
+ self.assertEqual(model.rowCount(), 3)
+
+ self.assertEqual(model.data(model.index(0)), documents[0])
+ self.assertEqual(model.data(model.index(1)), documents[1])
+ self.assertEqual(model.data(model.index(2)), documents[2])
+
+ def test_data_method(self):
+ """data returns documents for DisplayRole and contents for UserRole"""
+ model = DocumentListModel()
+ documents = ["Doc 1", "Doc 2", "Doc 3"]
+ contents = ["bar", "foo", "bar foo"]
+ model.setup_data(documents, contents)
+
+ self.assertEqual(model.data(model.index(0), Qt.DisplayRole), documents[0])
+ self.assertEqual(model.data(model.index(1), Qt.DisplayRole), documents[1])
+ self.assertEqual(model.data(model.index(2), Qt.DisplayRole), documents[2])
+
+ self.assertEqual(model.data(model.index(0), Qt.UserRole), contents[0])
+ self.assertEqual(model.data(model.index(1), Qt.UserRole), contents[1])
+ self.assertEqual(model.data(model.index(2), Qt.UserRole), contents[2])
+
+ # roles that the model does not handle result in None
+ self.assertIsNone(model.data(model.index(2), Qt.BackgroundRole))
+
+ def test_update_filter_content(self):
+ """update_filter_content replaces UserRole values of existing rows"""
+ model = DocumentListModel()
+ documents = ["Doc 1", "Doc 2", "Doc 3"]
+ contents = ["bar", "foo", "bar foo"]
+ model.setup_data(documents, contents)
+
+ model.update_filter_content(["a", "b", "c"])
+ self.assertEqual(model.data(model.index(0), Qt.UserRole), "a")
+ self.assertEqual(model.data(model.index(1), Qt.UserRole), "b")
+ self.assertEqual(model.data(model.index(2), Qt.UserRole), "c")
+
+ # content must have the same length as the list of documents
+ with self.assertRaises(AssertionError):
+ model.update_filter_content(
+ [
+ "a",
+ "b",
+ ]
+ )
+
+
+class TestFilterModel(TestCase):
+ """Tests of DocumentsFilterProxyModel on top of DocumentListModel"""
+
+ def test_filter_model(self):
+ """All rows are shown without a filter; only matching rows with it"""
+ model = DocumentListModel()
+ filter_model = DocumentsFilterProxyModel()
+ filter_model.setSourceModel(model)
+ documents = ["Doc 1", "Doc 2", "Doc 3"]
+ contents = ["bar", "foo", "bar foo"]
+ model.setup_data(documents, contents)
+
+ # __regex is None - all data shown
+ self.assertEqual(filter_model.rowCount(), 3)
+ self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0])
+ self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[1])
+ self.assertEqual(filter_model.data(filter_model.index(2, 0)), documents[2])
+
+ # with regex set
+ filter_model.set_filter_string("bar")
+ self.assertEqual(filter_model.rowCount(), 2)
+ self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0])
+ self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[2])
+
+ def test_empty_model(self):
+ """Proxy on top of an empty source model has no rows"""
+ model = DocumentListModel()
+ filter_model = DocumentsFilterProxyModel()
+ filter_model.setSourceModel(model)
+ self.assertEqual(filter_model.rowCount(), 0)
class TestCorpusViewerWidget(WidgetTest):
def setUp(self):
self.widget = self.create_widget(OWCorpusViewer)
- self.corpus = Corpus.from_file('deerwester')
+ self.corpus = Corpus.from_file("deerwester")
def test_data(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
@@ -31,23 +123,23 @@ def test_search(self):
self.process_events()
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
self.assertEqual(len(out_corpus), 1)
- self.assertEqual(self.widget.matches, 7)
+ self.assertEqual(self.widget.n_matches, 7)
# first document is selected, when filter with word that is not in
- # selected document out_corpus is None
+ # selected document, first of shown documents is selected
self.widget.regexp_filter = "graph"
self.widget.refresh_search()
self.process_events()
- out_corpus = self.get_output(self.widget.Outputs.matching_docs)
- self.assertIsNone(out_corpus)
+ self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs)))
# word count doesn't depend on selection
- self.assertEqual(self.widget.matches, 7)
+ self.assertEqual(self.widget.n_matches, 7)
# when filter is removed, matched words is 0
self.widget.regexp_filter = ""
self.widget.refresh_search()
self.process_events()
- self.assertEqual(self.widget.matches, 0)
+ self.wait_until_finished()
+ self.assertEqual(self.widget.n_matches, 0)
def test_highlighting(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
@@ -68,15 +160,11 @@ def test_highlighting(self):
self.assertIn('', html)
def test_highlighting_non_latin(self):
- documents = [
- {
- 'content': """царстве есть сад с молодильными яблоками"""
- }
- ]
+ documents = [{"content": """царстве есть сад с молодильными яблоками"""}]
metas = [
- (StringVariable('content'), lambda doc: doc.get('content')),
+ (StringVariable("content"), lambda doc: doc.get("content")),
]
- dataset_name = 'RussianDocument'
+ dataset_name = "RussianDocument"
corpus = Corpus.from_documents(documents, dataset_name, metas=metas)
self.send_signal(self.widget.Inputs.corpus, corpus)
@@ -90,56 +178,54 @@ def test_highlighting_non_latin(self):
self.assertIn('', html)
def test_output(self):
- """ Output is intersection between selection and filter """
+ """Output is intersection between selection and filter"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.widget.regexp_filter = "graph"
self.widget.refresh_search()
self.process_events()
- self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
+ # when intersection is empty automatically select first document shown
+ mathing = self.get_output(self.widget.Outputs.matching_docs)
+ self.assertEqual(1, len(mathing))
self.assertEqual(
- 9, len(self.get_output(self.widget.Outputs.other_docs))
+ mathing.get_column_view("Text")[0][0],
+ "The generation of random binary unordered trees",
)
+ self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs)))
self.assertEqual(
len(self.corpus.domain.metas) + 1,
- len(self.get_output(self.widget.Outputs.corpus).domain.metas)
+ len(self.get_output(self.widget.Outputs.corpus).domain.metas),
)
self.widget.doc_list.selectAll() # selects current documents in list
- self.assertEqual(
- 4, len(self.get_output(self.widget.Outputs.matching_docs))
- )
- self.assertEqual(
- 5, len(self.get_output(self.widget.Outputs.other_docs))
- )
+ self.assertEqual(4, len(self.get_output(self.widget.Outputs.matching_docs)))
+ self.assertEqual(5, len(self.get_output(self.widget.Outputs.other_docs)))
output = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(
len(self.get_output(self.widget.Outputs.matching_docs)),
- sum(output.get_column_view("Selected")[0])
+ sum(output.get_column_view("Selected")[0]),
)
self.widget.regexp_filter = "human"
self.widget.refresh_search()
self.process_events()
- # empty because none of matching documents is selected
- self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
+ # when intersection is empty automatically select first document shown
+ mathing = self.get_output(self.widget.Outputs.matching_docs)
+ self.assertEqual(1, len(mathing))
self.assertEqual(
- 9, len(self.get_output(self.widget.Outputs.other_docs))
+ mathing.get_column_view("Text")[0][0],
+ "Human machine interface for lab abc computer applications",
)
+ self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs)))
output = self.get_output(self.widget.Outputs.corpus)
- self.assertEqual(0,
- sum(output.get_column_view("Selected")[0]))
+ self.assertEqual(1, sum(output.get_column_view("Selected")[0]))
self.widget.doc_list.selectAll()
- self.assertEqual(
- 5, len(self.get_output(self.widget.Outputs.matching_docs))
- )
- self.assertEqual(
- 4, len(self.get_output(self.widget.Outputs.other_docs))
- )
+ self.assertEqual(5, len(self.get_output(self.widget.Outputs.matching_docs)))
+ self.assertEqual(4, len(self.get_output(self.widget.Outputs.other_docs)))
output = self.get_output(self.widget.Outputs.corpus)
self.assertEqual(
len(self.get_output(self.widget.Outputs.matching_docs)),
- sum(output.get_column_view("Selected")[0])
+ sum(output.get_column_view("Selected")[0]),
)
self.send_signal(self.widget.Inputs.corpus, None)
@@ -149,7 +235,7 @@ def test_output(self):
def test_empty_corpus(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
- self.assertListEqual(self.widget.selected_documents, [])
+ self.assertSetEqual(self.widget.selected_documents, set())
self.assertEqual(self.widget.doc_list.model().rowCount(), 0)
def test_report(self):
@@ -159,6 +245,120 @@ def test_report(self):
self.process_events()
self.widget.send_report()
+    def test_filter_attributes(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.widget.filter_input.setText("graph")
+        self.widget.refresh_search()
+
+        # all attributes are searched ("Document 6" has "graph" only in Category)
+        doc_model = self.widget.doc_list.model()
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(
+            doc_shown, ["Document 6", "Document 7", "Document 8", "Document 9"]
+        )
+
+        # restrict the search to "Text" only (documents with "graph" in Text remain)
+        slv = self.widget.search_listbox
+        slv.selectionModel().select(
+            slv.model().index(1), QItemSelectionModel.ClearAndSelect
+        )
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"])
+
+    def test_filters_restored_from_context(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.widget.filter_input.setText("graph")
+        self.widget.refresh_search()
+        slv = self.widget.search_listbox
+        slv.selectionModel().select(
+            slv.model().index(1), QItemSelectionModel.ClearAndSelect
+        )
+        self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]])
+
+        # send a different corpus so that the context-dependent settings change
+        temp_corpus = Corpus.from_file("andersen")
+        self.send_signal(self.widget.Inputs.corpus, temp_corpus)
+        self.assertListEqual(self.widget.search_features, list(temp_corpus.domain))
+
+        # resend the first corpus: search_features and the filter must be restored
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]])
+        self.assertEqual(self.widget.regexp_filter, "graph")
+
+        # check that the restored values are actually used for filtering:
+        # the filter content must include only values from the Text column
+        self.assertListEqual(
+            self.widget.doc_list_model.get_filter_content(),
+            self.corpus.get_column_view("Text")[0].tolist(),
+        )
+        # only "Text" used for filtering (shown documents with "graph" in Text)
+        doc_model = self.widget.doc_list.model()
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"])
+
+    def test_data_only_hidden_attributes(self):
+        for a in self.corpus.domain:
+            a.attributes["hidden"] = True  # flag each iterated variable as hidden
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        # all 9 documents are still listed (the filter is not expected to work)
+        self.assertEqual(self.widget.doc_list_model.rowCount(), 9)
+
+    def test_token_checkbox(self):
+        corpus_tokens = BASE_TOKENIZER(self.corpus)
+        self.send_signal(self.widget.Inputs.corpus, corpus_tokens)
+        self.assertTrue(self.widget.show_tokens_checkbox.isEnabled())
+        self.assertFalse(self.widget.show_tokens_checkbox.isChecked())
+
+        self.widget.show_tokens_checkbox.setChecked(True)
+        self.assertTrue(self.widget.show_tokens_checkbox.isChecked())
+
+        # corpus without tokens on input: checkbox becomes disabled and unchecked
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.assertFalse(self.widget.show_tokens_checkbox.isChecked())
+        self.assertFalse(self.widget.show_tokens_checkbox.isEnabled())
+
+    def test_image(self):
+        im_attr = StringVariable("Image")
+        im_attr.attributes["origin"] = "/path/to/image"
+        im_attr.attributes["type"] = "image"
+        domain = self.corpus.domain
+        im_corpus = self.corpus.transform(
+            Domain(domain.attributes, metas=domain.metas + (im_attr,))
+        )
+        with im_corpus.unlocked(im_corpus.metas):
+            im_corpus[:, im_attr] = np.array(["image_name"] + [""] * 8).reshape(-1, 1)
+        self.send_signal(self.widget.Inputs.corpus, im_corpus)
+        # Could not find a clean way to read the rendered content back from the
+        # view, and patching does not work on all systems, so this only checks
+        # that a corpus with an image attribute does not make the widget fail.
+
+    def test_migrate_settings(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        packed_data = self.widget.settingsHandler.pack_data(self.widget)
+        context = packed_data["context_settings"][0]
+        # reuse the context packed from the live widget and rewrite it to use
+        # index-based values: index 0 -> Category, index 1 -> Text (asserted below)
+        context.values["display_indices"] = [0]
+        context.values["search_indices"] = [1]
+        context.values["__version__"] = 1
+        # version-1 style context (presumably): attributes as a tuple of dict items
+        context.attributes = tuple(context.attributes.items())
+        self.widget = self.create_widget(
+            OWCorpusViewer,
+            stored_settings={"context_settings": [context], "__version__": 1},
+        )
+        self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=self.widget)
+        domain = self.corpus.domain
+        self.assertListEqual(self.widget.display_features, [domain["Category"]])
+        self.assertListEqual(self.widget.search_features, [domain["Text"]])
+
+
if __name__ == "__main__":
unittest.main()