From 0d8973dd8060f8a1b63ac2415f55971bf00ec077 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 23 Feb 2024 11:36:45 +0100 Subject: [PATCH] Keywords - language from corpus --- orangecontrib/text/widgets/owkeywords.py | 73 ++++++++++++++++-------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py index 75c0d22ba..019a8861c 100644 --- a/orangecontrib/text/widgets/owkeywords.py +++ b/orangecontrib/text/widgets/owkeywords.py @@ -22,12 +22,17 @@ from orangecontrib.text import Corpus from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \ YAKE_LANGUAGES, RAKE_LANGUAGES -from orangecontrib.text.language import ISO2LANG, LANG2ISO +from orangecontrib.text.language import ISO2LANG, LANG2ISO, LanguageModel from orangecontrib.text.preprocess import BaseNormalizer from orangecontrib.text.widgets.utils import enum2int from orangecontrib.text.widgets.utils.words import create_words_table, \ WORDS_COLUMN_NAME +CONNECTION_WARNING = ( + f"{ScoringMethods.MBERT} could not extract keywords from some " + "documents due to connection error. Please rerun keyword extraction." 
+) + class Results(SimpleNamespace): # currently wanted (aggregated) scores @@ -36,6 +41,8 @@ class Results(SimpleNamespace): labels: List[str] = [] # all calculated keywords {method: [[(word1, score1), ...]]} all_keywords: Dict[str, List[List[Tuple[str, float]]]] = {} + # warnings raised during the keyword extraction process + warnings: List[str] = [] def run( @@ -47,7 +54,7 @@ def run( agg_method: int, state: TaskState ) -> Results: - results = Results(scores=[], labels=[], all_keywords={}) + results = Results(scores=[], labels=[], all_keywords={}, warnings=[]) if not corpus: return results @@ -69,7 +76,8 @@ def callback(i: float, status=""): step = 1 / len(scoring_methods) for method_name, func in ScoringMethods.ITEMS: if method_name in scoring_methods: - if method_name not in results.all_keywords: + keywords = results.all_keywords.get(method_name) + if keywords is None: i = len(results.labels) cb = wrap_callback(callback, start=i * step, end=(i + 1) * step) @@ -78,10 +86,20 @@ def callback(i: float, status=""): kw = {"progress_callback": cb} kw.update(scoring_methods_kwargs.get(method_name, {})) - keywords = func(corpus if needs_tokens else documents, **kw) - results.all_keywords[method_name] = keywords + kws = func(corpus if needs_tokens else documents, **kw) + # None means that embedding failed completely for a document; + # currently this only happens with mbert when the connection fails + keywords = [kw for kw in kws if kw is not None] + # don't store keywords in all_keywords if any were not computed + # due to connection issues; storing them would prevent the + # missing keywords from being recomputed on the next run + # mbert's existing keywords are cached in the embedding cache, + # so only the missing ones will be recomputed + if len(kws) > len(keywords) and method_name == ScoringMethods.MBERT: + results.warnings.append(CONNECTION_WARNING) + else: + results.all_keywords[method_name] = keywords - keywords = results.all_keywords[method_name] scores[method_name] = \ 
dict(AggregationMethods.aggregate(keywords, agg_method)) @@ -93,12 +111,10 @@ def callback(i: float, status=""): # Normalize words for preprocessor in corpus.used_preprocessor.preprocessors: if isinstance(preprocessor, BaseNormalizer): - # todo: language dummy = Corpus.from_numpy( Domain((), metas=[StringVariable("Words")]), X=np.empty((len(words), 0)), - metas=np.array(words)[:, None], - language=corpus.language, + metas=np.array(words)[:, None] ) words = list(preprocessor(dummy).tokens.flatten()) @@ -183,7 +199,7 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin): description = "Infers characteristic words from the input corpus." icon = "icons/Keywords.svg" priority = 1100 - keywords = ["characteristic", "term"] + keywords = "extract keywords, characteristic, term" buttons_area_orientation = Qt.Vertical settings_version = 2 @@ -191,12 +207,11 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin): # Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr) # in setting we want to save integer and not Enum object (in case of PyQt6) DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder)) - DEFAULT_LANGUAGE = "English" settingsHandler = DomainContextHandler() selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF}) - yake_language: str = Setting(DEFAULT_LANGUAGE) - rake_language: str = Setting(DEFAULT_LANGUAGE) + yake_language: str = Setting("en") + rake_language: str = Setting("en") agg_method: int = Setting(AggregationMethods.MEAN) sel_method: int = ContextSetting(SelectionMethods.N_BEST) n_selected: int = ContextSetting(3) @@ -213,6 +228,7 @@ class Outputs: class Warning(OWWidget.Warning): no_words_column = Msg("Input is missing 'Words' column.") + extraction_warnings = Msg("{}") def __init__(self): OWWidget.__init__(self) @@ -228,13 +244,17 @@ def _setup_gui(self): box = gui.widgetBox(self.controlArea, "Scoring Methods", grid) yake_cb = gui.comboBox( - self.controlArea, self, "yake_lang_index", items=[ISO2LANG[lg] for lg in 
YAKE_LANGUAGES], - sendSelectedValue=True, # value is actual string not index + self.controlArea, + self, + "yake_language", + model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES), callback=self.__on_yake_lang_changed ) rake_cb = gui.comboBox( - self.controlArea, self, "rake_lang_index", items=[ISO2LANG[lg] for lg in RAKE_LANGUAGES], - sendSelectedValue=True, # value is actual string not index + self.controlArea, + self, + "rake_language", + model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES), callback=self.__on_rake_lang_changed ) @@ -268,7 +288,7 @@ def _setup_gui(self): button.setChecked(method == self.sel_method) grid.addWidget(button, method, 0) self.__sel_method_buttons.addButton(button, method) - self.__sel_method_buttons.buttonClicked.connect(self._set_selection_method) + self.__sel_method_buttons.idClicked.connect(self._set_selection_method) spin = gui.spin( box, self, "n_selected", 1, 999, addToLayout=False, @@ -383,20 +403,20 @@ def handleNewSignals(self): def update_scores(self): kwargs = { ScoringMethods.YAKE: { - "language": LANG2ISO[self.yake_lang_index], + "language": self.yake_language, "max_len": self.corpus.ngram_range[1] if self.corpus else 1 }, ScoringMethods.RAKE: { - "language": LANG2ISO[self.rake_lang_index], + "language": self.rake_language, "max_len": self.corpus.ngram_range[1] if self.corpus else 1 }, } self.start(run, self.corpus, self.words, self.__cached_keywords, self.selected_scoring_methods, kwargs, self.agg_method) - def _set_selection_method(self): - self.sel_method = self.__sel_method_buttons.checkedId() - self.__sel_method_buttons.button(self.sel_method).setChecked(True) + def _set_selection_method(self, method: int): + self.sel_method = method + self.__sel_method_buttons.button(method).setChecked(True) self._select_rows() def _select_rows(self): @@ -446,6 +466,8 @@ def on_done(self, results: Results): self._select_rows() else: self.__on_selection_changed() + if results.warnings: + 
self.Warning.extraction_warnings("\n".join(results.warnings)) def _apply_sorting(self): if self.model.columnCount() <= self.sort_column_order[0]: @@ -497,8 +519,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]): if version is None or version < 2: # before version 2 settings were indexes now they are strings # with language name and selected aggregator name - settings["yake_language"] = YAKE_LANGUAGES[settings["language"]] - settings["rake_language"] = RAKE_LANGUAGES[settings["aggregator"]] + settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]] + # todo + settings["rake_language"] = RAKE_LANGUAGES[settings["rake_lang_index"]] if __name__ == "__main__":