Skip to content

Commit

Permalink
Keywords - language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 23, 2024
1 parent 7a3b999 commit 0d8973d
Showing 1 changed file with 48 additions and 25 deletions.
73 changes: 48 additions & 25 deletions orangecontrib/text/widgets/owkeywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,17 @@
from orangecontrib.text import Corpus
from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \
YAKE_LANGUAGES, RAKE_LANGUAGES
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.language import ISO2LANG, LANG2ISO, LanguageModel
from orangecontrib.text.preprocess import BaseNormalizer
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table, \
WORDS_COLUMN_NAME

CONNECTION_WARNING = (
f"{ScoringMethods.MBERT} could not extract keywords from some "
"documents due to connection error. Please rerun keyword extraction."
)


class Results(SimpleNamespace):
# currently wanted (aggregated) scores
Expand All @@ -36,6 +41,8 @@ class Results(SimpleNamespace):
labels: List[str] = []
# all calculated keywords {method: [[(word1, score1), ...]]}
all_keywords: Dict[str, List[List[Tuple[str, float]]]] = {}
# warnings collected during the keyword extraction process
warnings: List[str] = []


def run(
Expand All @@ -47,7 +54,7 @@ def run(
agg_method: int,
state: TaskState
) -> Results:
results = Results(scores=[], labels=[], all_keywords={})
results = Results(scores=[], labels=[], all_keywords={}, warnings=[])
if not corpus:
return results

Expand All @@ -69,7 +76,8 @@ def callback(i: float, status=""):
step = 1 / len(scoring_methods)
for method_name, func in ScoringMethods.ITEMS:
if method_name in scoring_methods:
if method_name not in results.all_keywords:
keywords = results.all_keywords.get(method_name)
if keywords is None:
i = len(results.labels)
cb = wrap_callback(callback, start=i * step,
end=(i + 1) * step)
Expand All @@ -78,10 +86,20 @@ def callback(i: float, status=""):
kw = {"progress_callback": cb}
kw.update(scoring_methods_kwargs.get(method_name, {}))

keywords = func(corpus if needs_tokens else documents, **kw)
results.all_keywords[method_name] = keywords
kws = func(corpus if needs_tokens else documents, **kw)
# None means that embedding failed completely for that document;
# currently this only happens with mbert when the connection fails
keywords = [kw for kw in kws if kw is not None]
# don't store keywords in all_keywords if any were not computed
# due to connection issues; storing them would prevent the
# missing keywords from being recomputed on the next run.
# mbert's existing keywords are cached in the embedding cache,
# so only the missing ones will be recomputed
if len(kws) > len(keywords) and method_name == ScoringMethods.MBERT:
results.warnings.append(CONNECTION_WARNING)
else:
results.all_keywords[method_name] = keywords

keywords = results.all_keywords[method_name]
scores[method_name] = \
dict(AggregationMethods.aggregate(keywords, agg_method))

Expand All @@ -93,12 +111,10 @@ def callback(i: float, status=""):
# Normalize words
for preprocessor in corpus.used_preprocessor.preprocessors:
if isinstance(preprocessor, BaseNormalizer):
# todo: language
dummy = Corpus.from_numpy(
Domain((), metas=[StringVariable("Words")]),
X=np.empty((len(words), 0)),
metas=np.array(words)[:, None],
language=corpus.language,
metas=np.array(words)[:, None]
)
words = list(preprocessor(dummy).tokens.flatten())

Expand Down Expand Up @@ -183,20 +199,19 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
description = "Infers characteristic words from the input corpus."
icon = "icons/Keywords.svg"
priority = 1100
keywords = ["characteristic", "term"]
keywords = "extract keywords, characteristic, term"

buttons_area_orientation = Qt.Vertical
settings_version = 2

# Qt.DescendingOrder is an IntEnum in PyQt5 and an Enum in PyQt6 (both have a value attr);
# in settings we want to save an integer and not an Enum object (in case of PyQt6)
DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder))
DEFAULT_LANGUAGE = "English"

settingsHandler = DomainContextHandler()
selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF})
yake_language: str = Setting(DEFAULT_LANGUAGE)
rake_language: str = Setting(DEFAULT_LANGUAGE)
yake_language: str = Setting("en")
rake_language: str = Setting("en")
agg_method: int = Setting(AggregationMethods.MEAN)
sel_method: int = ContextSetting(SelectionMethods.N_BEST)
n_selected: int = ContextSetting(3)
Expand All @@ -213,6 +228,7 @@ class Outputs:

class Warning(OWWidget.Warning):
no_words_column = Msg("Input is missing 'Words' column.")
extraction_warnings = Msg("{}")

def __init__(self):
OWWidget.__init__(self)
Expand All @@ -228,13 +244,17 @@ def _setup_gui(self):
box = gui.widgetBox(self.controlArea, "Scoring Methods", grid)

yake_cb = gui.comboBox(
self.controlArea, self, "yake_lang_index", items=[ISO2LANG[lg] for lg in YAKE_LANGUAGES],
sendSelectedValue=True, # value is actual string not index
self.controlArea,
self,
"yake_language",
model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES),
callback=self.__on_yake_lang_changed
)
rake_cb = gui.comboBox(
self.controlArea, self, "rake_lang_index", items=[ISO2LANG[lg] for lg in RAKE_LANGUAGES],
sendSelectedValue=True, # value is actual string not index
self.controlArea,
self,
"rake_language",
model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES),
callback=self.__on_rake_lang_changed
)

Expand Down Expand Up @@ -268,7 +288,7 @@ def _setup_gui(self):
button.setChecked(method == self.sel_method)
grid.addWidget(button, method, 0)
self.__sel_method_buttons.addButton(button, method)
self.__sel_method_buttons.buttonClicked.connect(self._set_selection_method)
self.__sel_method_buttons.idClicked.connect(self._set_selection_method)

spin = gui.spin(
box, self, "n_selected", 1, 999, addToLayout=False,
Expand Down Expand Up @@ -383,20 +403,20 @@ def handleNewSignals(self):
def update_scores(self):
kwargs = {
ScoringMethods.YAKE: {
"language": LANG2ISO[self.yake_lang_index],
"language": self.yake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
ScoringMethods.RAKE: {
"language": LANG2ISO[self.rake_lang_index],
"language": self.rake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
}
self.start(run, self.corpus, self.words, self.__cached_keywords,
self.selected_scoring_methods, kwargs, self.agg_method)

def _set_selection_method(self):
self.sel_method = self.__sel_method_buttons.checkedId()
self.__sel_method_buttons.button(self.sel_method).setChecked(True)
def _set_selection_method(self, method: int):
self.sel_method = method
self.__sel_method_buttons.button(method).setChecked(True)
self._select_rows()

def _select_rows(self):
Expand Down Expand Up @@ -446,6 +466,8 @@ def on_done(self, results: Results):
self._select_rows()
else:
self.__on_selection_changed()
if results.warnings:
self.Warning.extraction_warnings("\n".join(results.warnings))

def _apply_sorting(self):
if self.model.columnCount() <= self.sort_column_order[0]:
Expand Down Expand Up @@ -497,8 +519,9 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
if version is None or version < 2:
# before version 2, settings were indexes; now they are strings
# with language name and selected aggregator name
settings["yake_language"] = YAKE_LANGUAGES[settings["language"]]
settings["rake_language"] = RAKE_LANGUAGES[settings["aggregator"]]
settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]]
# todo
settings["rake_language"] = RAKE_LANGUAGES[settings["rake_lang_index"]]


if __name__ == "__main__":
Expand Down

0 comments on commit 0d8973d

Please sign in to comment.