From 0d8973dd8060f8a1b63ac2415f55971bf00ec077 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 23 Feb 2024 11:36:45 +0100
Subject: [PATCH] Keywords - language from corpus
---
orangecontrib/text/widgets/owkeywords.py | 73 ++++++++++++++++--------
1 file changed, 48 insertions(+), 25 deletions(-)
diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py
index 75c0d22ba..019a8861c 100644
--- a/orangecontrib/text/widgets/owkeywords.py
+++ b/orangecontrib/text/widgets/owkeywords.py
@@ -22,12 +22,17 @@
from orangecontrib.text import Corpus
from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \
YAKE_LANGUAGES, RAKE_LANGUAGES
-from orangecontrib.text.language import ISO2LANG, LANG2ISO
+from orangecontrib.text.language import ISO2LANG, LANG2ISO, LanguageModel
from orangecontrib.text.preprocess import BaseNormalizer
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table, \
WORDS_COLUMN_NAME
+CONNECTION_WARNING = (
+ f"{ScoringMethods.MBERT} could not extract keywords from some "
+ "documents due to connection error. Please rerun keyword extraction."
+)
+
class Results(SimpleNamespace):
# currently wanted (aggregated) scores
@@ -36,6 +41,8 @@ class Results(SimpleNamespace):
labels: List[str] = []
# all calculated keywords {method: [[(word1, score1), ...]]}
all_keywords: Dict[str, List[List[Tuple[str, float]]]] = {}
+ # warnings raised during the keyword extraction process
+ warnings: List[str] = []
def run(
@@ -47,7 +54,7 @@ def run(
agg_method: int,
state: TaskState
) -> Results:
- results = Results(scores=[], labels=[], all_keywords={})
+ results = Results(scores=[], labels=[], all_keywords={}, warnings=[])
if not corpus:
return results
@@ -69,7 +76,8 @@ def callback(i: float, status=""):
step = 1 / len(scoring_methods)
for method_name, func in ScoringMethods.ITEMS:
if method_name in scoring_methods:
- if method_name not in results.all_keywords:
+ keywords = results.all_keywords.get(method_name)
+ if keywords is None:
i = len(results.labels)
cb = wrap_callback(callback, start=i * step,
end=(i + 1) * step)
@@ -78,10 +86,20 @@ def callback(i: float, status=""):
kw = {"progress_callback": cb}
kw.update(scoring_methods_kwargs.get(method_name, {}))
- keywords = func(corpus if needs_tokens else documents, **kw)
- results.all_keywords[method_name] = keywords
+ kws = func(corpus if needs_tokens else documents, **kw)
+ # None means that embedding failed completely for a document;
+ # currently this only happens with mbert when the connection fails
+ keywords = [kw for kw in kws if kw is not None]
+ # don't store keywords in all_keywords if any were not computed
+ # due to connection issues; storing them would prevent the
+ # missing keywords from being recomputed on the next run.
+ # mbert's existing keywords are cached in the embedding cache,
+ # so only the missing ones will be recomputed
+ if len(kws) > len(keywords) and method_name == ScoringMethods.MBERT:
+ results.warnings.append(CONNECTION_WARNING)
+ else:
+ results.all_keywords[method_name] = keywords
- keywords = results.all_keywords[method_name]
scores[method_name] = \
dict(AggregationMethods.aggregate(keywords, agg_method))
@@ -93,12 +111,10 @@ def callback(i: float, status=""):
# Normalize words
for preprocessor in corpus.used_preprocessor.preprocessors:
if isinstance(preprocessor, BaseNormalizer):
- # todo: language
dummy = Corpus.from_numpy(
Domain((), metas=[StringVariable("Words")]),
X=np.empty((len(words), 0)),
- metas=np.array(words)[:, None],
- language=corpus.language,
+ metas=np.array(words)[:, None]
)
words = list(preprocessor(dummy).tokens.flatten())
@@ -183,7 +199,7 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
description = "Infers characteristic words from the input corpus."
icon = "icons/Keywords.svg"
priority = 1100
- keywords = ["characteristic", "term"]
+ keywords = "extract keywords, characteristic, term"
buttons_area_orientation = Qt.Vertical
settings_version = 2
@@ -191,12 +207,11 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
# Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr)
# in setting we want to save integer and not Enum object (in case of PyQt6)
DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder))
- DEFAULT_LANGUAGE = "English"
settingsHandler = DomainContextHandler()
selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF})
- yake_language: str = Setting(DEFAULT_LANGUAGE)
- rake_language: str = Setting(DEFAULT_LANGUAGE)
+ yake_language: str = Setting("en")
+ rake_language: str = Setting("en")
agg_method: int = Setting(AggregationMethods.MEAN)
sel_method: int = ContextSetting(SelectionMethods.N_BEST)
n_selected: int = ContextSetting(3)
@@ -213,6 +228,7 @@ class Outputs:
class Warning(OWWidget.Warning):
no_words_column = Msg("Input is missing 'Words' column.")
+ extraction_warnings = Msg("{}")
def __init__(self):
OWWidget.__init__(self)
@@ -228,13 +244,17 @@ def _setup_gui(self):
box = gui.widgetBox(self.controlArea, "Scoring Methods", grid)
yake_cb = gui.comboBox(
- self.controlArea, self, "yake_lang_index", items=[ISO2LANG[lg] for lg in YAKE_LANGUAGES],
- sendSelectedValue=True, # value is actual string not index
+ self.controlArea,
+ self,
+ "yake_language",
+ model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES),
callback=self.__on_yake_lang_changed
)
rake_cb = gui.comboBox(
- self.controlArea, self, "rake_lang_index", items=[ISO2LANG[lg] for lg in RAKE_LANGUAGES],
- sendSelectedValue=True, # value is actual string not index
+ self.controlArea,
+ self,
+ "rake_language",
+ model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES),
callback=self.__on_rake_lang_changed
)
@@ -268,7 +288,7 @@ def _setup_gui(self):
button.setChecked(method == self.sel_method)
grid.addWidget(button, method, 0)
self.__sel_method_buttons.addButton(button, method)
- self.__sel_method_buttons.buttonClicked.connect(self._set_selection_method)
+ self.__sel_method_buttons.idClicked.connect(self._set_selection_method)
spin = gui.spin(
box, self, "n_selected", 1, 999, addToLayout=False,
@@ -383,20 +403,20 @@ def handleNewSignals(self):
def update_scores(self):
kwargs = {
ScoringMethods.YAKE: {
- "language": LANG2ISO[self.yake_lang_index],
+ "language": self.yake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
ScoringMethods.RAKE: {
- "language": LANG2ISO[self.rake_lang_index],
+ "language": self.rake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
}
self.start(run, self.corpus, self.words, self.__cached_keywords,
self.selected_scoring_methods, kwargs, self.agg_method)
- def _set_selection_method(self):
- self.sel_method = self.__sel_method_buttons.checkedId()
- self.__sel_method_buttons.button(self.sel_method).setChecked(True)
+ def _set_selection_method(self, method: int):
+ self.sel_method = method
+ self.__sel_method_buttons.button(method).setChecked(True)
self._select_rows()
def _select_rows(self):
@@ -446,6 +466,8 @@ def on_done(self, results: Results):
self._select_rows()
else:
self.__on_selection_changed()
+ if results.warnings:
+ self.Warning.extraction_warnings("\n".join(results.warnings))
def _apply_sorting(self):
if self.model.columnCount() <= self.sort_column_order[0]:
@@ -497,8 +519,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
if version is None or version < 2:
# before version 2 settings were indexes now they are strings
# with language name and selected aggregator name
- settings["yake_language"] = YAKE_LANGUAGES[settings["language"]]
- settings["rake_language"] = RAKE_LANGUAGES[settings["aggregator"]]
+ settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]]
+ # todo
+ settings["rake_language"] = RAKE_LANGUAGES[settings["rake_lang_index"]]
if __name__ == "__main__":