Skip to content

Commit

Permalink
fix: make taxonomy translations consistent
Browse files Browse the repository at this point in the history
they were random because we picked one translation among the synonyms
  • Loading branch information
alexgarel committed Oct 15, 2024
1 parent 3e3e4d1 commit 475b87c
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 22 deletions.
21 changes: 12 additions & 9 deletions app/_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,21 +263,24 @@ def gen_taxonomy_documents(
"""
for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
for node in taxonomy.iter_nodes():
names = {}
for lang in supported_langs:
lang_names = set()
if lang in node.names:
lang_names.add(node.names[lang])
if lang in node.synonyms:
lang_names |= set(node.synonyms[lang])
names[lang] = list(lang_names)
names = {
lang: lang_names
for lang, lang_names in node.names.items()
if lang in supported_langs
}
synonyms = {
lang: lang_names
for lang, lang_names in node.synonyms.items()
if lang in supported_langs
}

yield {
"_index": next_index,
"_source": {
"id": node.id,
"taxonomy_name": taxonomy_name,
"names": names,
"name": names,
"synonyms": synonyms,
},
}

Expand Down
25 changes: 16 additions & 9 deletions app/facets.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
def _get_translations(
lang: str, items: list[tuple[str, str]], index_config: config.IndexConfig
) -> dict[tuple[str, str], str]:
"""Get translations for a list of items
:param lang: target language
:param items: list of (entry id, field_name)
:param index_config: the index configuration
:return: a dict mapping (id, field_name) to the translation
"""
# go from field_name to taxonomy
field_names = set([field_name for _, field_name in items])
field_taxonomy: dict[str, str] = {
Expand All @@ -25,7 +32,7 @@ def _get_translations(
for field_name in field_names
if index_config.fields[field_name].taxonomy_name
}
# fetch items names
# fetch items names within a single query
items_to_fetch = [
(id, field_taxonomy[field_name])
for id, field_name in items
Expand All @@ -35,24 +42,24 @@ def _get_translations(
# compute best translations
translations: dict[tuple[str, str], str] = {}
for id, field_name in items:
item_translations = None
item_translation = None
names = (
items_names.get((id, field_taxonomy[field_name]))
if field_name in field_taxonomy
else None
)
if names:
item_translations = names.get(lang, None)
item_translation = names.get(lang, None)
# fall back to the item's main language
if not item_translations:
if not item_translation:
main_lang = id.split(":", 1)[0]
item_translations = names.get(main_lang, None)
item_translation = names.get(main_lang, None)
# fall back to English
if not translations:
item_translations = names.get("en", None)
if not item_translation:
item_translation = names.get("en", None)
# eventually translate
if item_translations:
translations[(id, field_name)] = item_translations[0]
if item_translation:
translations[(id, field_name)] = item_translation
return translations


Expand Down
12 changes: 11 additions & 1 deletion app/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,17 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
mapping.field("id", dsl_field.Keyword(required=True))
mapping.field("taxonomy_name", dsl_field.Keyword(required=True))
mapping.field(
"names",
"name",
dsl_field.Object(
required=True,
dynamic=False,
properties={
lang: dsl_field.Keyword(required=False) for lang in supported_langs
},
),
),
mapping.field(
"synonyms",
dsl_field.Object(
required=True,
dynamic=False,
Expand Down
2 changes: 1 addition & 1 deletion app/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def build_completion_query(
"""

completion_clause = {
"field": f"names.{lang}",
"field": f"synonyms.{lang}",
"size": size,
"contexts": {"taxonomy_name": taxonomy_names},
}
Expand Down
4 changes: 2 additions & 2 deletions app/taxonomy_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
def get_taxonomy_names(
items: list[tuple[str, str]],
config: IndexConfig,
) -> dict[tuple[str, str], dict[str, list[str]]]:
) -> dict[tuple[str, str], dict[str, str]]:
"""Given a set of terms in different taxonomies, return their names"""
filters = []
for id, taxonomy_name in items:
Expand All @@ -32,7 +32,7 @@ def get_taxonomy_names(
.params(size=len(filters))
)
return {
(result.id, result.taxonomy_name): result.names.to_dict()
(result.id, result.taxonomy_name): result.name.to_dict()
for result in query.execute().hits
}

Expand Down

0 comments on commit 475b87c

Please sign in to comment.