From 475b87c1ff8d0927516ccf086277d5b260aeba3d Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 15 Oct 2024 15:04:39 +0200 Subject: [PATCH] fix: make taxonomy translations consistent they were random as we pick one translation among synonyms --- app/_import.py | 21 ++++++++++++--------- app/facets.py | 25 ++++++++++++++++--------- app/indexing.py | 12 +++++++++++- app/query.py | 2 +- app/taxonomy_es.py | 4 ++-- 5 files changed, 42 insertions(+), 22 deletions(-) diff --git a/app/_import.py b/app/_import.py index 6c95d564..bd16276f 100644 --- a/app/_import.py +++ b/app/_import.py @@ -263,21 +263,24 @@ def gen_taxonomy_documents( """ for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)): for node in taxonomy.iter_nodes(): - names = {} - for lang in supported_langs: - lang_names = set() - if lang in node.names: - lang_names.add(node.names[lang]) - if lang in node.synonyms: - lang_names |= set(node.synonyms[lang]) - names[lang] = list(lang_names) + names = { + lang: lang_names + for lang, lang_names in node.names.items() + if lang in supported_langs + } + synonyms = { + lang: lang_names + for lang, lang_names in node.synonyms.items() + if lang in supported_langs + } yield { "_index": next_index, "_source": { "id": node.id, "taxonomy_name": taxonomy_name, - "names": names, + "name": names, + "synonyms": synonyms, }, } diff --git a/app/facets.py b/app/facets.py index 4b633a62..d7df76ad 100644 --- a/app/facets.py +++ b/app/facets.py @@ -17,6 +17,13 @@ def _get_translations( lang: str, items: list[tuple[str, str]], index_config: config.IndexConfig ) -> dict[tuple[str, str], str]: + """Get translations for a list of items + + :param lang: target language + :param items: list of (entry id, field_name) + :param index_config: the index configuration + :return: a dict mapping (id, field_name) to the translation + """ # go from field_name to taxonomy field_names = set([field_name for _, field_name in items]) field_taxonomy: dict[str, str] = { @@ -25,7
+32,7 @@ def _get_translations( for field_name in field_names if index_config.fields[field_name].taxonomy_name } - # fetch items names + # fetch items names within a single query items_to_fetch = [ (id, field_taxonomy[field_name]) for id, field_name in items @@ -35,24 +42,24 @@ def _get_translations( # compute best translations translations: dict[tuple[str, str], str] = {} for id, field_name in items: - item_translations = None + item_translation = None names = ( items_names.get((id, field_taxonomy[field_name])) if field_name in field_taxonomy else None ) if names: - item_translations = names.get(lang, None) + item_translation = names.get(lang, None) # fold back to main language for item - if not item_translations: + if not item_translation: main_lang = id.split(":", 1)[0] - item_translations = names.get(main_lang, None) + item_translation = names.get(main_lang, None) # fold back to english - if not translations: - item_translations = names.get("en", None) + if not item_translation: + item_translation = names.get("en", None) # eventually translate - if item_translations: - translations[(id, field_name)] = item_translations[0] + if item_translation: + translations[(id, field_name)] = item_translation return translations diff --git a/app/indexing.py b/app/indexing.py index 7d0ed297..e79a473d 100644 --- a/app/indexing.py +++ b/app/indexing.py @@ -356,7 +356,17 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping: mapping.field("id", dsl_field.Keyword(required=True)) mapping.field("taxonomy_name", dsl_field.Keyword(required=True)) mapping.field( - "names", + "name", + dsl_field.Object( + required=True, + dynamic=False, + properties={ + lang: dsl_field.Keyword(required=False) for lang in supported_langs + }, + ), + ), + mapping.field( + "synonyms", dsl_field.Object( required=True, dynamic=False, diff --git a/app/query.py b/app/query.py index 4f3ee3a8..7e58c494 100644 --- a/app/query.py +++ b/app/query.py @@ -339,7 +339,7 @@ def build_completion_query( 
""" completion_clause = { - "field": f"names.{lang}", + "field": f"synonyms.{lang}", "size": size, "contexts": {"taxonomy_name": taxonomy_names}, } diff --git a/app/taxonomy_es.py b/app/taxonomy_es.py index ba1d08b7..a8f713e4 100644 --- a/app/taxonomy_es.py +++ b/app/taxonomy_es.py @@ -20,7 +20,7 @@ def get_taxonomy_names( items: list[tuple[str, str]], config: IndexConfig, -) -> dict[tuple[str, str], dict[str, list[str]]]: +) -> dict[tuple[str, str], dict[str, str]]: """Given a set of terms in different taxonomies, return their names""" filters = [] for id, taxonomy_name in items: @@ -32,7 +32,7 @@ def get_taxonomy_names( .params(size=len(filters)) ) return { - (result.id, result.taxonomy_name): result.names.to_dict() + (result.id, result.taxonomy_name): result.name.to_dict() for result in query.execute().hits }