From 2f6d3f92270d6c5c6a8015c81191e4587f6aea48 Mon Sep 17 00:00:00 2001
From: PrimozGodec <p.godec9@gmail.com>
Date: Fri, 16 Sep 2022 16:04:41 +0200
Subject: [PATCH] Speedup corpus viewer

---
 orangecontrib/text/widgets/owcorpusviewer.py  | 819 +++++++++++-------
 .../text/widgets/tests/test_owcorpusviewer.py | 280 +++++-
 2 files changed, 760 insertions(+), 339 deletions(-)
diff --git a/orangecontrib/text/widgets/owcorpusviewer.py b/orangecontrib/text/widgets/owcorpusviewer.py
index 44c56fd24..8ecae15f4 100644
--- a/orangecontrib/text/widgets/owcorpusviewer.py
+++ b/orangecontrib/text/widgets/owcorpusviewer.py
@@ -1,27 +1,291 @@
 import os
 import re
 import sre_constants
-from itertools import chain
-from typing import Set
+from typing import Any, Iterable, List, Set
 
+import numpy as np
 from AnyQt.QtCore import (
-    Qt, QUrl, QItemSelection, QItemSelectionModel, QItemSelectionRange
+    QAbstractListModel,
+    QEvent,
+    QItemSelection,
+    QItemSelectionModel,
+    QItemSelectionRange,
+    QModelIndex,
+    QSortFilterProxyModel,
+    Qt,
+    QUrl,
 )
-
-from AnyQt.QtGui import QStandardItemModel, QStandardItem
-from AnyQt.QtWidgets import (QListView, QSizePolicy, QTableView,
-                             QAbstractItemView, QHeaderView, QSplitter,
-                             QApplication)
-
-from Orange.data.domain import filter_visible
+from AnyQt.QtWidgets import (
+    QAbstractItemView,
+    QApplication,
+    QHeaderView,
+    QListView,
+    QSizePolicy,
+    QSplitter,
+    QTableView,
+)
+from Orange.data import Variable
+from Orange.data.domain import Domain, filter_visible
 from Orange.widgets import gui
-from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
+from Orange.widgets.settings import ContextSetting, Setting, DomainContextHandler
 from Orange.widgets.utils.annotated_data import create_annotated_table
-from Orange.widgets.widget import OWWidget, Msg, Input, Output
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.itemmodels import DomainModel
+from Orange.widgets.widget import Input, Msg, Output, OWWidget
+from orangecanvas.gui.utils import disconnected
+from orangewidget.utils.listview import ListViewSearch
+
 from orangecontrib.text.corpus import Corpus
 
+HTML = """
+<!doctype html>
+<html>
+<head>
+<script type="text/javascript" src="resources/jquery-3.1.1.min.js">
+</script>
+<script type="text/javascript" src="resources/jquery.mark.min.js">
+</script>
+<script type="text/javascript" src="resources/highlighter.js">
+</script>
+<meta charset='utf-8'>
+<style>
+
+table {{ border-collapse: collapse; }}
+mark {{ background: #FFCD28; }}
+
+tr > td {{
+    padding-bottom: 3px;
+    padding-top: 3px;
+}}
+
+body {{
+    font-family: Helvetica;
+    font-size: 10pt;
+}}
+
+.line {{ border-bottom: 1px solid #000; }}
+.separator {{ height: 5px; }}
+
+.variables {{
+    vertical-align: top;
+    padding-right: 10px;
+}}
+
+.content {{
+    /* Adopted from https://css-tricks.com/snippets/css/prevent-long-urls-from-breaking-out-of-container/ */
+
+    /* These are technically the same, but use both */
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+
+    -ms-word-break: break-all;
+    /* This is the dangerous one in WebKit, as it breaks things wherever */
+    word-break: break-all;
+    /* Instead use this non-standard one: */
+    word-break: break-word;
+
+    /* Adds a hyphen where the word breaks, if supported (No Blink) */
+    -ms-hyphens: auto;
+    -moz-hyphens: auto;
+    -webkit-hyphens: auto;
+    hyphens: auto;
+}}
+
+.token {{
+    padding: 3px;
+    border: 1px #B0B0B0 solid;
+    margin-right: 5px;
+    margin-bottom: 5px;
+    display: inline-block;
+}}
+
+img {{
+    max-width: 100%;
+}}
+
+</style>
+</head>
+<body>
+{}
+</body>
+</html>
+"""
+SEPARATOR = (
+    '<tr class="line separator"><td/><td/></tr><tr class="separator"><td/><td/></tr>'
+)
+
 
-class OWCorpusViewer(OWWidget):
+def _count_matches(content: List[str], search_string: str, state: TaskState) -> int:
+    """
+    Count the occurrences of any term from search_string in the content texts.
+
+    Parameters
+    ----------
+    content
+        List of texts in which the occurrences are counted
+    search_string
+        Terms that are searched for in the texts, case-insensitively. This
+        parameter has the format term1|term2|term3|...
+
+    Returns
+    -------
+    Number of all matches of search_string in all texts in content list
+    """
+    matches = 0
+    if search_string:
+        regex = re.compile(search_string.strip("|"), re.IGNORECASE)
+        for i, text in enumerate(content):
+            matches += len(regex.findall(text))
+            state.set_progress_value((i + 1) / len(content) * 100)
+    return matches
+
+
+class DocumentListModel(QAbstractListModel):
+    """
+    Custom model for listing documents. A custom model is used since Orange's
+    PyListModel is too slow for a large number of documents.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__visible_data = []
+        self.__filter_content = []
+
+    def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
+        if role == Qt.DisplayRole:
+            return self.__visible_data[index.row()]
+        elif role == Qt.UserRole:
+            return self.__filter_content[index.row()]
+
+    def rowCount(self, parent: QModelIndex = None, *args, **kwargs) -> int:
+        return len(self.__visible_data)
+
+    def setup_data(self, data: List[str], content: List[str]):
+        self.beginResetModel()
+        self.__visible_data = data
+        self.__filter_content = content
+        self.endResetModel()
+
+    def update_filter_content(self, content: List[str]):
+        assert len(content) == len(self.__visible_data)
+        self.__filter_content = content
+
+    def get_filter_content(self) -> List[str]:
+        return self.__filter_content
+
+    def clear(self):
+        self.beginResetModel()
+        self.__visible_data = []
+        self.__filter_content = []
+        self.endResetModel()
+
+
+class DocumentsFilterProxyModel(QSortFilterProxyModel):
+    """Filter model for documents list"""
+
+    __regex = None
+
+    def set_filter_string(self, filter_string: str):
+        self.__regex = re.compile(filter_string.strip("|"), re.IGNORECASE)
+        self.invalidateFilter()
+
+    def filterAcceptsRow(self, source_row: int, source_parent: QModelIndex) -> bool:
+        """Accept only documents that match the filter string"""
+        if self.__regex is None:
+            # filter is not defined yet - show all
+            return True
+        else:
+            index = self.sourceModel().index(source_row, 0, source_parent)
+            content = self.sourceModel().data(index, Qt.UserRole)
+            res = self.__regex.search(content)
+            return bool(res)
+
+
+class DocumentTableView(QTableView):
+    """TableView that disables unselecting all items"""
+
+    def selectionCommand(
+        self, index: QModelIndex, event: QEvent = None
+    ) -> QItemSelectionModel.SelectionFlags:
+        flags = super().selectionCommand(index, event)
+        selmodel = self.selectionModel()
+        if not index.isValid():  # Click on empty viewport; don't clear
+            return QItemSelectionModel.NoUpdate
+        if selmodel.isSelected(index):
+            currsel = selmodel.selectedIndexes()
+            if len(currsel) == 1 and index == currsel[0]:
+                # Is the last selected index; do not deselect it
+                return QItemSelectionModel.NoUpdate
+        if (
+            event is not None
+            and event.type() == QEvent.MouseMove
+            and flags & QItemSelectionModel.ToggleCurrent
+        ):
+            # Disable ctrl drag 'toggle'; can be made to deselect the last
+            # index, would need to keep track of the current selection
+            # (selectionModel does this but does not expose it)
+            flags &= ~QItemSelectionModel.Toggle
+            flags |= QItemSelectionModel.Select
+        return flags
+
+
+class VariableListViewSearch(ListViewSearch):
+    """ListViewSearch that disables unselecting all items in the list"""
+
+    def selectionCommand(
+        self, index: QModelIndex, event: QEvent = None
+    ) -> QItemSelectionModel.SelectionFlags:
+        flags = super().selectionCommand(index, event)
+        selmodel = self.selectionModel()
+        if not index.isValid():  # Click on empty viewport; don't clear
+            return QItemSelectionModel.NoUpdate
+        if selmodel.isSelected(index):
+            currsel = selmodel.selectedIndexes()
+            if len(currsel) == 1 and index == currsel[0]:
+                # Is the last selected index; do not deselect it
+                return QItemSelectionModel.NoUpdate
+        if (
+            event is not None
+            and event.type() == QEvent.MouseMove
+            and flags & QItemSelectionModel.ToggleCurrent
+        ):
+            # Disable ctrl drag 'toggle'; can be made to deselect the last
+            # index, would need to keep track of the current selection
+            # (selectionModel does this but does not expose it)
+            flags &= ~QItemSelectionModel.Toggle
+            flags |= QItemSelectionModel.Select
+        return flags
+
+    def set_selection(self, items: Iterable[Variable]):
+        """Set selected items in the list view"""
+        model = self.model()
+        values = self.model()[:]
+        items = [it for it in items if it in values]
+        selection = QItemSelection()
+        if items:
+            for val in items:
+                index = values.index(val)
+                selection.merge(
+                    QItemSelection(model.index(index, 0), model.index(index, 0)),
+                    QItemSelectionModel.Select,
+                )
+        self.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)
+
+
+class VisibleDomainModel(DomainModel):
+    """Domain model that keeps only visible features"""
+
+    def set_domain(self, domain):
+        if domain is not None:
+            domain = Domain(
+                filter_visible(domain.attributes),
+                class_vars=filter_visible(domain.class_vars),
+                metas=filter_visible(domain.metas),
+            )
+        super().set_domain(domain)
+
+
+class OWCorpusViewer(OWWidget, ConcurrentWidgetMixin):
     name = "Corpus Viewer"
     description = "Display corpus contents."
     icon = "icons/CorpusViewer.svg"
@@ -35,74 +299,78 @@ class Outputs:
         other_docs = Output("Other Docs", Corpus)
         corpus = Output("Corpus", Corpus)
 
-    settingsHandler = PerfectDomainContextHandler(
-        match_values = PerfectDomainContextHandler.MATCH_VALUES_ALL
-    )
+    settingsHandler = DomainContextHandler()
 
-    search_indices = ContextSetting([], exclude_metas=False)   # features included in search
-    display_indices = ContextSetting([], exclude_metas=False)  # features for display
-    display_features = ContextSetting([], exclude_metas=False)
-    selected_documents = ContextSetting([])
+    settings_version = 2
+    search_features: List[Variable] = ContextSetting([])
+    display_features: List[Variable] = ContextSetting([])
+    selected_documents: Set[int] = Setting({0}, schema_only=True)
     regexp_filter = ContextSetting("")
-
     show_tokens = Setting(False)
     autocommit = Setting(True)
 
     class Warning(OWWidget.Warning):
-        no_feats_search = Msg('No features included in search.')
-        no_feats_display = Msg('No features selected for display.')
+        no_feats_search = Msg("No features included in search.")
+        no_feats_display = Msg("No features selected for display.")
 
     def __init__(self):
         super().__init__()
+        ConcurrentWidgetMixin.__init__(self)
 
-        self.corpus = None              # Corpus
-        self.corpus_docs = None         # Documents generated from Corpus
-        self.doc_webview = None         # WebView for showing content
-        self.search_features = []       # two copies are needed since Display allows drag & drop
-        self.display_list_indices = [0]
-        self.matches = 0                # Matches of the query
+        self.corpus = None  # Corpus
+        self.__pending_selected_documents = self.selected_documents
 
         # Info attributes
         self.update_info()
-        info_box = gui.widgetBox(self.controlArea, 'Info')
-        gui.label(info_box, self, 'Tokens: %(n_tokens)s')
-        gui.label(info_box, self, 'Types: %(n_types)s')
-        gui.label(info_box, self, 'Matching documents: %(n_matching)s')
-        gui.label(info_box, self, 'Matches: %(n_matches)s')
+        info_box = gui.widgetBox(self.controlArea, "Info")
+        gui.label(info_box, self, "Tokens: %(n_tokens)s")
+        gui.label(info_box, self, "Types: %(n_types)s")
+        gui.label(info_box, self, "Matching documents: %(n_matching)s")
+        gui.label(info_box, self, "Matches: %(n_matches)s")
 
         # Search features
-        self.search_listbox = gui.listBox(
-            self.controlArea, self, 'search_indices', 'search_features',
-            selectionMode=QListView.ExtendedSelection,
-            box='Search features', callback=self.search_features_changed)
+        ex_sel = QListView.ExtendedSelection
+        search_box = gui.widgetBox(self.controlArea, "Search features")
+        self.search_listbox = sl = VariableListViewSearch(selectionMode=ex_sel)
+        search_box.layout().addWidget(sl)
+        sl.setModel(VisibleDomainModel(separators=False))
+        sl.selectionModel().selectionChanged.connect(self.search_features_changed)
 
         # Display features
-        display_box = gui.widgetBox(self.controlArea, 'Display features')
-        self.display_listbox = gui.listBox(
-            display_box, self, 'display_list_indices', 'display_features',
-            selectionMode=QListView.ExtendedSelection,
-            callback=self.show_docs, enableDragDrop=True)
-        self.show_tokens_checkbox = gui.checkBox(display_box, self, 'show_tokens',
-                                                 'Show Tokens && Tags', callback=self.show_docs)
+        display_box = gui.widgetBox(self.controlArea, "Display features")
+        self.display_listbox = dl = VariableListViewSearch(selectionMode=ex_sel)
+        display_box.layout().addWidget(dl)
+        dl.setModel(VisibleDomainModel(separators=False))
+        dl.selectionModel().selectionChanged.connect(self.display_features_changed)
+
+        self.show_tokens_checkbox = gui.checkBox(
+            display_box,
+            self,
+            "show_tokens",
+            "Show Tokens && Tags",
+            callback=self.show_docs,
+        )
 
         # Auto-commit box
-        gui.auto_commit(self.controlArea, self, 'autocommit', 'Send data', 'Auto send is on')
+        gui.auto_commit(
+            self.controlArea, self, "autocommit", "Send data", "Auto send is on"
+        )
 
         # Search
-        self.filter_input = gui.lineEdit(self.mainArea, self, 'regexp_filter',
-                                         orientation=Qt.Horizontal,
-                                         sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding,
-                                                                QSizePolicy.Fixed),
-                                         label='RegExp Filter:',
-                                         callback=self.refresh_search)
-
-        # Main area
-        self.splitter = QSplitter(
+        self.filter_input = gui.lineEdit(
+            self.mainArea,
+            self,
+            "regexp_filter",
             orientation=Qt.Horizontal,
-            childrenCollapsible=False,
+            sizePolicy=QSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed),
+            label="RegExp Filter:",
+            callback=self.refresh_search,
         )
+
+        # Main area
+        self.splitter = QSplitter(orientation=Qt.Horizontal, childrenCollapsible=False)
         # Document list
-        self.doc_list = QTableView()
+        self.doc_list = DocumentTableView()
         self.doc_list.setSelectionBehavior(QTableView.SelectRows)
         self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
         self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
@@ -110,11 +378,11 @@ def __init__(self):
         self.doc_list.horizontalHeader().setVisible(False)
         self.splitter.addWidget(self.doc_list)
 
-        self.doc_list_model = QStandardItemModel(self)
-        self.doc_list.setModel(self.doc_list_model)
-        self.doc_list.selectionModel().selectionChanged.connect(
-            self.selection_changed
-        )
+        self.doc_list_model = DocumentListModel()
+        proxy_model = DocumentsFilterProxyModel()
+        proxy_model.setSourceModel(self.doc_list_model)
+        self.doc_list.setModel(proxy_model)
+        self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed)
         # Document contents
         self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
 
@@ -129,23 +397,11 @@ def set_data(self, corpus=None):
         self.closeContext()
         self.reset_widget()
         self.corpus = corpus
-        self.search_features = []
         if corpus is not None:
-            domain = self.corpus.domain
-            # Enable/disable tokens checkbox
-            if not self.corpus.has_tokens():
-                self.show_tokens_checkbox.setCheckState(Qt.Unchecked)
-            self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())
-
-            self.search_features = list(filter_visible(chain(domain.variables, domain.metas)))
-            self.display_features = list(filter_visible(chain(domain.variables, domain.metas)))
-            self.search_indices = list(range(len(self.search_features)))
-            self.display_indices = list(range(len(self.display_features)))
-            self.selected_documents = [corpus.titles[0]] if \
-                corpus.titles is not None and len(corpus.titles) else []
+            self.setup_controls()
             self.openContext(self.corpus)
-            self.display_list_indices = self.display_indices
-            self.regenerate_docs()
+            self.doc_list.model().set_filter_string(self.regexp_filter)
+            self.select_variables()
             self.list_docs()
             self.update_info()
             self.set_selection()
@@ -155,219 +411,138 @@ def set_data(self, corpus=None):
     def reset_widget(self):
         # Corpus
         self.corpus = None
-        self.corpus_docs = None
-        self.display_features = []
         # Widgets
-        self.search_listbox.clear()
-        self.display_listbox.clear()
+        self.search_listbox.model().set_domain(None)
+        self.display_listbox.model().set_domain(None)
         self.filter_input.clear()
         self.update_info()
         # Models/vars
-        self.search_features.clear()
-        self.search_indices.clear()
-        self.display_indices.clear()
         self.doc_list_model.clear()
         # Warnings
         self.Warning.clear()
         # WebView
-        self.doc_webview.setHtml('')
+        self.doc_webview.setHtml("")
+
+    def setup_controls(self):
+        """Set up the controls in the control area"""
+        domain = self.corpus.domain
+        if not self.corpus.has_tokens():
+            self.show_tokens_checkbox.setCheckState(Qt.Unchecked)
+        self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())
+        self.search_listbox.model().set_domain(domain)
+        self.display_listbox.model().set_domain(domain)
+        self.search_features = self.search_listbox.model()[:]
+        self.display_features = self.display_listbox.model()[:]
+
+    def select_variables(self):
+        """Set the selection in the search and display features list views"""
+        smodel = self.search_listbox.model()
+        dmodel = self.display_listbox.model()
+        # the domain context handler may set some features that are not part
+        # of the current domain - remove them
+        self.search_features = [f for f in self.search_features if f in smodel]
+        self.display_features = [f for f in self.display_features if f in dmodel]
+        # if none remain after removing non-existent ones, select all (default)
+        if not self.search_features:
+            self.search_features = smodel[:]
+        if not self.display_features:
+            self.display_features = dmodel[:]
+        with disconnected(
+            self.search_listbox.selectionModel().selectionChanged,
+            self.search_features_changed,
+        ):
+            self.search_listbox.set_selection(self.search_features)
+        with disconnected(
+            self.display_listbox.selectionModel().selectionChanged,
+            self.display_features_changed,
+        ):
+            self.display_listbox.set_selection(self.display_features)
 
     def list_docs(self):
-        """ List documents into the left scrolling area """
-        if self.corpus_docs is None:
-            return
-        # TODO: remove search_keyword??
-        search_keyword = self.regexp_filter.strip('|')
-        matches = 0
-        try:
-            reg = re.compile(search_keyword, re.IGNORECASE)
-        except sre_constants.error:
-            return
-
-        self.doc_list_model.clear()
+        """List documents into the left scrolling area"""
+        docs = self.regenerate_docs()
+        self.doc_list_model.setup_data(self.corpus.titles.tolist(), docs)
 
-        for i, (doc, title, content) in enumerate(zip(self.corpus, self.corpus.titles,
-                                                      self.corpus_docs)):
-            res = len(list(reg.finditer(content))) if self.regexp_filter else 0
-            if not self.regexp_filter or res:
-                matches += res
-                item = QStandardItem()
-                item.setData(str(title), Qt.DisplayRole)
-                item.setData(doc, Qt.UserRole)
-                self.doc_list_model.appendRow(item)
-        self.matches = matches
-
-    def get_selected_documents_from_view(self) -> Set[str]:
-        """
-        Returns
-        -------
-        Set with names of selected documents in the QTableView
-        """
-        return {
-            i.data(Qt.DisplayRole)
-            for i in self.doc_list.selectionModel().selectedRows()
-        }
+    def get_selected_indexes(self) -> Set[int]:
+        m = self.doc_list.model().mapToSource
+        return {m(i).row() for i in self.doc_list.selectionModel().selectedRows()}
 
     def set_selection(self) -> None:
         """
         Select documents in selected_documents attribute in the view
         """
+        self.selected_documents = self.__pending_selected_documents
+        self.__pending_selected_documents = {0}
         view = self.doc_list
         model = view.model()
+        source_model = model.sourceModel()
 
-        previously_selected = self.selected_documents.copy()
         selection = QItemSelection()
-        for row in range(model.rowCount()):
-            document = model.data(model.index(row, 0), Qt.DisplayRole)
-            if document in self.selected_documents:
-                selection.append(QItemSelectionRange(
-                    view.model().index(row, 0),
-                    view.model().index(row, 0)
-                ))
-        view.selectionModel().select(
-            selection, QItemSelectionModel.ClearAndSelect
-        )
-        if len(selection) == 0:
-            # in cases when selection is empty qt's selection_changed is not
-            # called and so we need to manually trigger show_docs
-            self.show_docs()
-        # select emmit selection change signal which causes calling
-        # selection_changed when filtering it means that documents which
-        # are currently filtered out get removed from self.selected_douments
-        # we still want to keep them to be still selected after user removes
-        # filter
-        self.selected_documents = previously_selected
+        self.selected_documents = {
+            r for r in self.selected_documents if r < len(self.corpus)
+        }
+        for row in self.selected_documents:
+            index = model.mapFromSource(source_model.index(row, 0))
+            selection.append(QItemSelectionRange(index, index))
+        # don't emit the selection change signal to avoid calling commit twice;
+        # it is already called from set_data
+        with disconnected(
+            self.doc_list.selectionModel().selectionChanged, self.selection_changed
+        ):
+            view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)
 
     def selection_changed(self) -> None:
-        """
-        Function is called every time the selection changes - when user select
-        new range of documents
-        """
-        self.selected_documents = self.get_selected_documents_from_view()
+        """Function is called every time the selection changes"""
+        self.selected_documents = self.get_selected_indexes()
         self.show_docs()
         self.commit.deferred()
 
     def show_docs(self):
-        """ Show the selected documents in the right area """
-        HTML = '''
-        <!doctype html>
-        <html>
-        <head>
-        <script type="text/javascript" src="resources/jquery-3.1.1.min.js">
-        </script>
-        <script type="text/javascript" src="resources/jquery.mark.min.js">
-        </script>
-        <script type="text/javascript" src="resources/highlighter.js">
-        </script>
-        <meta charset='utf-8'>
-        <style>
-
-        table {{ border-collapse: collapse; }}
-        mark {{ background: #FFCD28; }}
-
-        tr > td {{
-            padding-bottom: 3px;
-            padding-top: 3px;
-        }}
-
-        body {{
-            font-family: Helvetica;
-            font-size: 10pt;
-        }}
-
-        .line {{ border-bottom: 1px solid #000; }}
-        .separator {{ height: 5px; }}
-
-        .variables {{
-            vertical-align: top;
-            padding-right: 10px;
-        }}
-        
-        .content {{
-            /* Adopted from https://css-tricks.com/snippets/css/prevent-long-urls-from-breaking-out-of-container/ */
-        
-            /* These are technically the same, but use both */
-            overflow-wrap: break-word;
-            word-wrap: break-word;
-        
-            -ms-word-break: break-all;
-            /* This is the dangerous one in WebKit, as it breaks things wherever */
-            word-break: break-all;
-            /* Instead use this non-standard one: */
-            word-break: break-word;
-        
-            /* Adds a hyphen where the word breaks, if supported (No Blink) */
-            -ms-hyphens: auto;
-            -moz-hyphens: auto;
-            -webkit-hyphens: auto;
-            hyphens: auto;
-        }}
-
-        .token {{
-            padding: 3px;
-            border: 1px #B0B0B0 solid;
-            margin-right: 5px;
-            margin-bottom: 5px;
-            display: inline-block;
-        }}
-
-        img {{
-            max-width: 100%;
-        }}
-
-        </style>
-        </head>
-        <body>
-        {}
-        </body>
-        </html>
-        '''
-        self.display_indices = self.display_list_indices
+        """Show the selected documents in the right area"""
         if self.corpus is None:
             return
 
         self.Warning.no_feats_display.clear()
-        if len(self.display_indices) == 0:
+        if len(self.display_features) == 0:
             self.Warning.no_feats_display()
 
         if self.show_tokens:
             tokens = list(self.corpus.ngrams_iterator(include_postags=True))
 
-        marked_search_features = [f for i, f in enumerate(self.search_features)
-                                  if i in self.search_indices]
-
-        html = '<table>'
-        for doc_count, index in enumerate(self.doc_list.selectionModel().selectedRows()):
-            if doc_count > 0:   # add split
-                html += '<tr class="line separator"><td/><td/></tr>' \
-                        '<tr class="separator"><td/><td/></tr>'
-
-            row_ind = index.data(Qt.UserRole).row_index
-            for ind in self.display_indices:
-                feature = self.display_features[ind]
-                value = str(index.data(Qt.UserRole)[feature.name])
-                if feature in marked_search_features:
+        parts = []
+        for doc_count, c_index in enumerate(sorted(self.selected_documents)):
+            text = ""
+            for feature in self.display_features:
+                value = str(self.corpus[c_index, feature.name])
+                if feature in self.search_features:
                     value = self.__mark_text(value)
-                value = value.replace('\n', '<br/>')
-                is_image = feature.attributes.get('type', '') == 'image'
-                if is_image and value != '?':
+                value = value.replace("\n", "<br/>")
+                is_image = feature.attributes.get("type", "") == "image"
+                if is_image and value != "?":
                     value = os.path.join(feature.attributes.get("origin", ""), value)
                     value = '<img src="{}"></img>'.format(value)
-                html += '<tr><td class="variables"><strong>{}:</strong></td>' \
-                        '<td class="content">{}</td></tr>'.format(
-                    feature.name, value)
+                text += (
+                    f'<tr><td class="variables"><strong>{feature.name}:</strong></td>'
+                    f'<td class="content">{value}</td></tr>'
+                )
 
             if self.show_tokens:
-                html += '<tr><td class="variables"><strong>Tokens & Tags:</strong></td>' \
-                        '<td>{}</td></tr>'.format(''.join('<span class="token">{}</span>'.format(
-                    token) for token in tokens[row_ind]))
-
-        html += '</table>'
+                tokens_ = "".join(
+                    f'<span class="token">{token}</span>' for token in tokens[c_index]
+                )
+                text += (
+                    f'<tr><td class="variables"><strong>Tokens & Tags:</strong></td>'
+                    f"<td>{tokens_}</td></tr>"
+                )
+            parts.append(text)
+
+        joined = SEPARATOR.join(parts)
+        html = f"<table>{joined}</table>"
         base = QUrl.fromLocalFile(__file__)
         self.doc_webview.setHtml(HTML.format(html), base)
 
     def __mark_text(self, text):
-        search_keyword = self.regexp_filter.strip('|')
+        search_keyword = self.regexp_filter.strip("|")
         if not search_keyword:
             return text
 
@@ -382,71 +557,91 @@ def __mark_text(self, text):
 
         text = list(text)
         for m in matches[::-1]:
-            text[m.start():m.end()] = list('<mark data-markjs="true">{}</mark>'\
-                .format("".join(text[m.start():m.end()])))
-
+            text[m.start() : m.end()] = list(
+                f'<mark data-markjs="true">{"".join(text[m.start(): m.end()])}</mark>'
+            )
         return "".join(text)
 
+    @staticmethod
+    def __get_selected_rows(view: QListView) -> List[Variable]:
+        rows = view.selectionModel().selectedRows()
+        values = view.model()[:]
+        return [values[row.row()] for row in sorted(rows, key=lambda r: r.row())]
+
     def search_features_changed(self):
-        self.regenerate_docs()
+        self.search_features = self.__get_selected_rows(self.search_listbox)
+        if self.corpus:
+            self.doc_list_model.update_filter_content(self.regenerate_docs())
+        self.doc_list.model().invalidateFilter()
         self.refresh_search()
 
-    def regenerate_docs(self):
-        self.corpus_docs = None
+    def display_features_changed(self):
+        self.display_features = self.__get_selected_rows(self.display_listbox)
+        self.show_docs()
+
+    def regenerate_docs(self) -> List[str]:
         self.Warning.no_feats_search.clear()
-        if self.corpus is not None:
-            feats = [self.search_features[i] for i in self.search_indices]
-            if len(feats) == 0:
-                self.Warning.no_feats_search()
-            self.corpus_docs = self.corpus.documents_from_features(feats)
+        if len(self.search_features) == 0:
+            self.Warning.no_feats_search()
+        return self.corpus.documents_from_features(self.search_features)
 
     def refresh_search(self):
         if self.corpus is not None:
-            self.list_docs()
-            self.set_selection()
+            self.doc_list.model().set_filter_string(self.regexp_filter)
+            if not self.selected_documents:
+                # when the currently selected items get filtered out, the selection
+                # is empty; select the first element in the view in that case
+                self.doc_list.setCurrentIndex(self.doc_list.model().index(0, 0))
             self.update_info()
+            self.start(
+                _count_matches,
+                self.doc_list_model.get_filter_content(),
+                self.regexp_filter,
+            )
+            self.show_docs()
             self.commit.deferred()
 
+    def on_done(self, res: int):
+        """When matches count is done show the result in the label"""
+        self.n_matches = res if res is not None else "n/a"
+
+    def on_exception(self, ex):
+        raise ex
+
     def update_info(self):
         if self.corpus is not None:
-            self.n_matching = '{}/{}'.format(self.doc_list_model.rowCount(), len(self.corpus))
-            self.n_matches = self.matches if self.matches else 'n/a'
-            self.n_tokens = sum(map(len, self.corpus.tokens)) if self.corpus.has_tokens() else 'n/a'
-            self.n_types = len(self.corpus.dictionary) if self.corpus.has_tokens() else 'n/a'
+            has_tokens = self.corpus.has_tokens()
+            self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.corpus)}"
+            self.n_tokens = sum(map(len, self.corpus.tokens)) if has_tokens else "n/a"
+            self.n_types = len(self.corpus.dictionary) if has_tokens else "n/a"
         else:
-            self.n_matching = ''
-            self.n_matches = ''
-            self.n_tokens = ''
-            self.n_types = ''
+            self.n_matching = "n/a"
+            self.n_matches = "n/a"
+            self.n_tokens = "n/a"
+            self.n_types = "n/a"
 
     @gui.deferred
     def commit(self):
         matched = unmatched = annotated_corpus = None
-        corpus = self.corpus
-        if corpus is not None:
-            # it returns a set of selected documents which are in view
-            selected_docs = self.get_selected_documents_from_view()
-            titles = corpus.titles
-            matched_mask = [
-                i for i, t in enumerate(titles) if t in selected_docs
-            ]
-            unmatched_mask = [
-                i for i, t in enumerate(titles) if t not in selected_docs
-            ]
-
-            matched = corpus[matched_mask] if len(matched_mask) else None
-            unmatched = corpus[unmatched_mask] if len(unmatched_mask) else None
-            annotated_corpus = create_annotated_table(corpus, matched_mask)
+        if self.corpus is not None:
+            selected_docs = sorted(self.get_selected_indexes())
+            matched = self.corpus[selected_docs] if selected_docs else None
+            mask = np.ones(len(self.corpus), bool)
+            mask[selected_docs] = 0
+            unmatched = self.corpus[mask] if mask.any() else None
+            annotated_corpus = create_annotated_table(self.corpus, selected_docs)
         self.Outputs.matching_docs.send(matched)
         self.Outputs.other_docs.send(unmatched)
         self.Outputs.corpus.send(annotated_corpus)
 
     def send_report(self):
-        self.report_items((
-            ("Query", self.regexp_filter),
-            ("Matching documents", self.n_matching),
-            ("Matches", self.n_matches)
-        ))
+        self.report_items(
+            (
+                ("Query", self.regexp_filter),
+                ("Matching documents", self.n_matching),
+                ("Matches", self.n_matches),
+            )
+        )
 
     def showEvent(self, event):
         super().showEvent(event)
@@ -460,18 +655,44 @@ def update_splitter(self):
         """
         w1, w2 = self.splitter.sizes()
         ws = w1 + w2
-        if w2 < 2/3 * ws:
-            self.splitter.setSizes([int(ws * 1/3), int(ws * 2/3)])
-
+        if w2 < 2 / 3 * ws:
+            self.splitter.setSizes([int(ws * 1 / 3), int(ws * 2 / 3)])
+
+    @classmethod
+    def migrate_context(cls, context, version):
+        if version < 2:
+            f_order = context.values.pop("display_features", None)
+            display_idx = context.values.pop("display_indices", [])
+            search_ids = context.values.pop("search_indices", [])
+            if f_order is not None:
+                f_order = f_order[0]
+                display_features = [f_order[i] for i in display_idx if i < len(f_order)]
+                search_features = [f_order[i] for i in search_ids if i < len(f_order)]
+                context.values["display_features"] = (display_features, -3)
+                context.values["search_features"] = (search_features, -3)
+
+            # the old widget used PerfectDomainContextHandler with MATCH_VALUES_ALL;
+            # now it uses DomainContextHandler. The differences are:
+            # - the perfect handler stores values in tuples, the domain one in dicts
+            # - the domain context handler stores class_vars together with
+            #   attributes, while the perfect handler stores them separately
+            # - since MATCH_VALUES_ALL was used, discrete var values were stored
+            #   with the var name (replaced here with 1, the id for a discrete var)
+            if hasattr(context, "class_vars"):
+                context.attributes = {
+                    attr: 1 if isinstance(v, list) else v
+                    for attr, v in context.attributes + context.class_vars
+                }
+                context.metas = dict(context.metas)
+                delattr(context, "class_vars")
+
+
+if __name__ == "__main__":
+    from orangewidget.utils.widgetpreview import WidgetPreview
 
-if __name__ == '__main__':
     from orangecontrib.text.preprocess import BASE_TOKENIZER
-    from orangecontrib.text.tag.pos import AveragedPerceptronTagger
-    from orangewidget.utils.widgetpreview import WidgetPreview
 
-    corpus = Corpus.from_file('book-excerpts')
-    corpus = corpus[:3]
-    tagger = AveragedPerceptronTagger()
-    tagged_corpus = tagger(BASE_TOKENIZER(corpus))
-    tagged_corpus.ngram_range = (1, 2)
-    WidgetPreview(OWCorpusViewer).run(tagged_corpus)
+    corpus_ = Corpus.from_file("book-excerpts")
+    corpus_ = corpus_[:3]
+    corpus_ = BASE_TOKENIZER(corpus_)
+    WidgetPreview(OWCorpusViewer).run(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
index 6d1847868..f19057885 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
@@ -1,18 +1,110 @@
 import unittest
+from unittest import TestCase
 
 import numpy as np
+from AnyQt.QtCore import QItemSelectionModel, Qt
 from AnyQt.QtTest import QSignalSpy
+from orangewidget.settings import Context
+
+from Orange.data import StringVariable, Domain
 from Orange.widgets.tests.base import WidgetTest
-from Orange.data import StringVariable
 
 from orangecontrib.text.corpus import Corpus
-from orangecontrib.text.widgets.owcorpusviewer import OWCorpusViewer
+from orangecontrib.text.preprocess import BASE_TOKENIZER
+from orangecontrib.text.widgets.owcorpusviewer import (
+    OWCorpusViewer,
+    DocumentListModel,
+    DocumentsFilterProxyModel,
+)
+
+
+class TestDocumentListModel(TestCase):
+    def test_empty(self):
+        model = DocumentListModel()
+        self.assertEqual(model.rowCount(), 0)
+        self.assertListEqual(model.get_filter_content(), [])
+
+    def test_data(self):
+        model = DocumentListModel()
+        documents = ["Doc 1", "Doc 2", "Doc 3"]
+        contents = ["bar", "foo", "bar foo"]
+        model.setup_data(documents, contents)
+
+        self.assertListEqual(model.get_filter_content(), contents)
+        self.assertEqual(model.rowCount(), 3)
+
+        self.assertEqual(model.data(model.index(0)), documents[0])
+        self.assertEqual(model.data(model.index(1)), documents[1])
+        self.assertEqual(model.data(model.index(2)), documents[2])
+
+    def test_data_method(self):
+        model = DocumentListModel()
+        documents = ["Doc 1", "Doc 2", "Doc 3"]
+        contents = ["bar", "foo", "bar foo"]
+        model.setup_data(documents, contents)
+
+        self.assertEqual(model.data(model.index(0), Qt.DisplayRole), documents[0])
+        self.assertEqual(model.data(model.index(1), Qt.DisplayRole), documents[1])
+        self.assertEqual(model.data(model.index(2), Qt.DisplayRole), documents[2])
+
+        self.assertEqual(model.data(model.index(0), Qt.UserRole), contents[0])
+        self.assertEqual(model.data(model.index(1), Qt.UserRole), contents[1])
+        self.assertEqual(model.data(model.index(2), Qt.UserRole), contents[2])
+
+        self.assertIsNone(model.data(model.index(2), Qt.BackgroundRole))
+
+    def test_update_filter_content(self):
+        model = DocumentListModel()
+        documents = ["Doc 1", "Doc 2", "Doc 3"]
+        contents = ["bar", "foo", "bar foo"]
+        model.setup_data(documents, contents)
+
+        model.update_filter_content(["a", "b", "c"])
+        self.assertEqual(model.data(model.index(0), Qt.UserRole), "a")
+        self.assertEqual(model.data(model.index(1), Qt.UserRole), "b")
+        self.assertEqual(model.data(model.index(2), Qt.UserRole), "c")
+
+        with self.assertRaises(AssertionError):
+            model.update_filter_content(
+                [
+                    "a",
+                    "b",
+                ]
+            )
+
+
+class TestFilterModel(TestCase):
+    def test_filter_model(self):
+        model = DocumentListModel()
+        filter_model = DocumentsFilterProxyModel()
+        filter_model.setSourceModel(model)
+        documents = ["Doc 1", "Doc 2", "Doc 3"]
+        contents = ["bar", "foo", "bar foo"]
+        model.setup_data(documents, contents)
+
+        # __regex is None - all data shown
+        self.assertEqual(filter_model.rowCount(), 3)
+        self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0])
+        self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[1])
+        self.assertEqual(filter_model.data(filter_model.index(2, 0)), documents[2])
+
+        # with regex set
+        filter_model.set_filter_string("bar")
+        self.assertEqual(filter_model.rowCount(), 2)
+        self.assertEqual(filter_model.data(filter_model.index(0, 0)), documents[0])
+        self.assertEqual(filter_model.data(filter_model.index(1, 0)), documents[2])
+
+    def test_empty_model(self):
+        model = DocumentListModel()
+        filter_model = DocumentsFilterProxyModel()
+        filter_model.setSourceModel(model)
+        self.assertEqual(filter_model.rowCount(), 0)
 
 
 class TestCorpusViewerWidget(WidgetTest):
     def setUp(self):
         self.widget = self.create_widget(OWCorpusViewer)
-        self.corpus = Corpus.from_file('deerwester')
+        self.corpus = Corpus.from_file("deerwester")
 
     def test_data(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
@@ -31,23 +123,23 @@ def test_search(self):
         self.process_events()
         out_corpus = self.get_output(self.widget.Outputs.matching_docs)
         self.assertEqual(len(out_corpus), 1)
-        self.assertEqual(self.widget.matches, 7)
+        self.assertEqual(self.widget.n_matches, 7)
 
         # first document is selected, when filter with word that is not in
-        # selected document out_corpus is None
+        # selected document, first of shown documents is selected
         self.widget.regexp_filter = "graph"
         self.widget.refresh_search()
         self.process_events()
-        out_corpus = self.get_output(self.widget.Outputs.matching_docs)
-        self.assertIsNone(out_corpus)
+        self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs)))
         # word count doesn't depend on selection
-        self.assertEqual(self.widget.matches, 7)
+        self.assertEqual(self.widget.n_matches, 7)
 
         # when filter is removed, matched words is 0
         self.widget.regexp_filter = ""
         self.widget.refresh_search()
         self.process_events()
-        self.assertEqual(self.widget.matches, 0)
+        self.wait_until_finished()
+        self.assertEqual(self.widget.n_matches, 0)
 
     def test_highlighting(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
@@ -68,15 +160,11 @@ def test_highlighting(self):
         self.assertIn('<mark data-markjs="true">', html)
 
     def test_highlighting_non_latin(self):
-        documents = [
-            {
-                'content': """царстве есть сад с молодильными яблоками"""
-            }
-        ]
+        documents = [{"content": """царстве есть сад с молодильными яблоками"""}]
         metas = [
-            (StringVariable('content'), lambda doc: doc.get('content')),
+            (StringVariable("content"), lambda doc: doc.get("content")),
         ]
-        dataset_name = 'RussianDocument'
+        dataset_name = "RussianDocument"
         corpus = Corpus.from_documents(documents, dataset_name, metas=metas)
 
         self.send_signal(self.widget.Inputs.corpus, corpus)
@@ -90,56 +178,54 @@ def test_highlighting_non_latin(self):
         self.assertIn('<mark data-markjs="true">', html)
 
     def test_output(self):
-        """ Output is intersection between selection and filter """
+        """Output is intersection between selection and filter"""
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.widget.regexp_filter = "graph"
         self.widget.refresh_search()
         self.process_events()
-        self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
+        # when intersection is empty automatically select first document shown
+        mathing = self.get_output(self.widget.Outputs.matching_docs)
+        self.assertEqual(1, len(mathing))
         self.assertEqual(
-            9, len(self.get_output(self.widget.Outputs.other_docs))
+            mathing.get_column_view("Text")[0][0],
+            "The generation of random binary unordered trees",
         )
+        self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs)))
         self.assertEqual(
             len(self.corpus.domain.metas) + 1,
-            len(self.get_output(self.widget.Outputs.corpus).domain.metas)
+            len(self.get_output(self.widget.Outputs.corpus).domain.metas),
         )
 
         self.widget.doc_list.selectAll()  # selects current documents in list
-        self.assertEqual(
-            4, len(self.get_output(self.widget.Outputs.matching_docs))
-        )
-        self.assertEqual(
-            5, len(self.get_output(self.widget.Outputs.other_docs))
-        )
+        self.assertEqual(4, len(self.get_output(self.widget.Outputs.matching_docs)))
+        self.assertEqual(5, len(self.get_output(self.widget.Outputs.other_docs)))
         output = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(
             len(self.get_output(self.widget.Outputs.matching_docs)),
-            sum(output.get_column_view("Selected")[0])
+            sum(output.get_column_view("Selected")[0]),
         )
 
         self.widget.regexp_filter = "human"
         self.widget.refresh_search()
         self.process_events()
-        # empty because none of matching documents is selected
-        self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
+        # when intersection is empty automatically select first document shown
+        mathing = self.get_output(self.widget.Outputs.matching_docs)
+        self.assertEqual(1, len(mathing))
         self.assertEqual(
-            9, len(self.get_output(self.widget.Outputs.other_docs))
+            mathing.get_column_view("Text")[0][0],
+            "Human machine interface for lab abc computer applications",
         )
+        self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs)))
         output = self.get_output(self.widget.Outputs.corpus)
-        self.assertEqual(0,
-                         sum(output.get_column_view("Selected")[0]))
+        self.assertEqual(1, sum(output.get_column_view("Selected")[0]))
 
         self.widget.doc_list.selectAll()
-        self.assertEqual(
-            5, len(self.get_output(self.widget.Outputs.matching_docs))
-        )
-        self.assertEqual(
-            4, len(self.get_output(self.widget.Outputs.other_docs))
-        )
+        self.assertEqual(5, len(self.get_output(self.widget.Outputs.matching_docs)))
+        self.assertEqual(4, len(self.get_output(self.widget.Outputs.other_docs)))
         output = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(
             len(self.get_output(self.widget.Outputs.matching_docs)),
-            sum(output.get_column_view("Selected")[0])
+            sum(output.get_column_view("Selected")[0]),
         )
 
         self.send_signal(self.widget.Inputs.corpus, None)
@@ -149,7 +235,7 @@ def test_output(self):
 
     def test_empty_corpus(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
-        self.assertListEqual(self.widget.selected_documents, [])
+        self.assertSetEqual(self.widget.selected_documents, set())
         self.assertEqual(self.widget.doc_list.model().rowCount(), 0)
 
     def test_report(self):
@@ -159,6 +245,120 @@ def test_report(self):
         self.process_events()
         self.widget.send_report()
 
+    def test_filter_attributes(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.widget.filter_input.setText("graph")
+        self.widget.refresh_search()
+
+        # all attributes used for filtering (shown documents with "graph" in Category)
+        doc_model = self.widget.doc_list.model()
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(
+            doc_shown, ["Document 6", "Document 7", "Document 8", "Document 9"]
+        )
+
+        # only "Text" used for filtering (shown documents with "graph" in Text)
+        slv = self.widget.search_listbox
+        slv.selectionModel().select(
+            slv.model().index(1), QItemSelectionModel.ClearAndSelect
+        )
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"])
+
+    def test_filters_restored_from_context(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.widget.filter_input.setText("graph")
+        self.widget.refresh_search()
+        slv = self.widget.search_listbox
+        slv.selectionModel().select(
+            slv.model().index(1), QItemSelectionModel.ClearAndSelect
+        )
+        self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]])
+
+        # send some other data to change values
+        temp_corpus = Corpus.from_file("andersen")
+        self.send_signal(self.widget.Inputs.corpus, temp_corpus)
+        self.assertListEqual(self.widget.search_features, list(temp_corpus.domain))
+
+        # test if corpus correctly restored for search_features
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.assertListEqual(self.widget.search_features, [self.corpus.domain["Text"]])
+        self.assertEqual(self.widget.regexp_filter, "graph")
+
+        # check if restored values correctly used for filtering
+        # filter_content must include only values from the text column
+        self.assertListEqual(
+            self.widget.doc_list_model.get_filter_content(),
+            self.corpus.get_column_view("Text")[0].tolist(),
+        )
+        # only "Text" used for filtering (shown documents with "graph" in Text)
+        doc_model = self.widget.doc_list.model()
+        doc_shown = [
+            doc_model.data(doc_model.index(i, 0)) for i in range(doc_model.rowCount())
+        ]
+        self.assertListEqual(doc_shown, ["Document 7", "Document 8", "Document 9"])
+
+    def test_data_only_hidden_attributes(self):
+        for a in self.corpus.domain:
+            a.attributes["hidden"] = True
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        # documents are shown but filter does not work
+        self.assertEqual(self.widget.doc_list_model.rowCount(), 9)
+
+    def test_token_checkbox(self):
+        corpus_tokens = BASE_TOKENIZER(self.corpus)
+        self.send_signal(self.widget.Inputs.corpus, corpus_tokens)
+        self.assertTrue(self.widget.show_tokens_checkbox.isEnabled())
+        self.assertFalse(self.widget.show_tokens_checkbox.isChecked())
+
+        self.widget.show_tokens_checkbox.setChecked(True)
+        self.assertTrue(self.widget.show_tokens_checkbox.isChecked())
+
+        # for an input corpus without tokens the checkbox is disabled and unchecked
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.assertFalse(self.widget.show_tokens_checkbox.isChecked())
+        self.assertFalse(self.widget.show_tokens_checkbox.isEnabled())
+
+    def test_image(self):
+        im_attr = StringVariable("Image")
+        im_attr.attributes["origin"] = "/path/to/image"
+        im_attr.attributes["type"] = "image"
+        domain = self.corpus.domain
+        im_corpus = self.corpus.transform(
+            Domain(domain.attributes, metas=domain.metas + (im_attr,))
+        )
+        with im_corpus.unlocked(im_corpus.metas):
+            im_corpus[:, im_attr] = np.array(["image_name"] + [""] * 8).reshape(-1, 1)
+        self.send_signal(self.widget.Inputs.corpus, im_corpus)
+        # there is no nice way to get the content from the view to test its
+        # correctness, and patching does not work on all systems, so this only
+        # tests that having an image in the corpus does not fail
+
+    def test_migrate_settings(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        packed_data = self.widget.settingsHandler.pack_data(self.widget)
+        context = packed_data["context_settings"][0]
+        # we borrow display_features from setting extracted from widget, it
+        # contains Category and Text
+        context.values["display_indices"] = [0]
+        context.values["search_indices"] = [1]
+        context.values["__version__"] = 1
+        context.attributes = tuple(context.attributes.items())
+        context.attributes = context.attributes
+        self.widget = self.create_widget(
+            OWCorpusViewer,
+            stored_settings={"context_settings": [context], "__version__": 1},
+        )
+        self.send_signal(self.widget.Inputs.corpus, self.corpus, widget=self.widget)
+        domain = self.corpus.domain
+        self.assertListEqual(self.widget.display_features, [domain["Category"]])
+        self.assertListEqual(self.widget.search_features, [domain["Text"]])
+
+
 
 if __name__ == "__main__":
     unittest.main()