Skip to content

Commit

Permalink
feat: enhanced completion (#259)
Browse files Browse the repository at this point in the history
Improvements on taxonomies completion API:

* add the option of a preprocessor for taxonomies
* add a preprocessor for Open Food Facts to handle:
  * brands taxonomy
  * add main language
  * add xx entries to all languages
* fix to have names in synonyms
* add a score to completion to first match shortest entries
* enable querying multiple languages at once
* added tests
* suggestions are now declared using a component inside search-bar, enabling
more parameters to be supported in the future.
* refactor web components to put interfaces in separate files
* fixes on facets to support terms with "-" and multiple terms

TODO:
- [x] modify search-bar web component to use langs instead of lang.

---------

Co-authored-by: Stéphane Gigandet <[email protected]>
  • Loading branch information
alexgarel and stephanegigandet authored Nov 12, 2024
1 parent df65d3a commit 5e05aef
Show file tree
Hide file tree
Showing 47 changed files with 1,263 additions and 454 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ build-translations:
@echo "🔎 Building translations …"
${DOCKER_COMPOSE} run --rm search_nodejs npm run translations:build

# Clean up Elasticsearch indexes via the app CLI
# (extra CLI arguments can be passed through ${args}).
cleanup-indexes:
	@echo "🔎 Cleaning indexes …"
	${DOCKER_COMPOSE} run --rm api python3 -m app cleanup-indexes ${args}

# Export the OpenAPI specification of the API to a YAML file.
generate-openapi: _ensure_network
	@echo "🔎 Generating OpenAPI spec …"
	${DOCKER_COMPOSE} run --rm api python3 -m app export-openapi /opt/search/data/searchalicious-openapi.yml
Expand Down
52 changes: 39 additions & 13 deletions app/_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from redis import Redis

from app._types import FetcherResult, FetcherStatus, JSONType
from app.config import Config, IndexConfig, TaxonomyConfig, settings
from app.config import Config, IndexConfig, settings
from app.indexing import (
BaseTaxonomyPreprocessor,
DocumentProcessor,
generate_index_object,
generate_taxonomy_index_object,
Expand Down Expand Up @@ -252,7 +253,7 @@ def gen_documents(


def gen_taxonomy_documents(
taxonomy_config: TaxonomyConfig, next_index: str, supported_langs: set[str]
config: IndexConfig, next_index: str, supported_langs: set[str]
):
"""Generator for taxonomy documents in Elasticsearch.
Expand All @@ -261,26 +262,51 @@ def gen_taxonomy_documents(
:param supported_langs: a set of supported languages
:yield: a dict with the document to index, compatible with ES bulk API
"""
for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
taxonomy_config = config.taxonomy
preprocessor: BaseTaxonomyPreprocessor | None = None
if taxonomy_config.preprocessor:
preprocessor_cls = load_class_object_from_string(taxonomy_config.preprocessor)
preprocessor = preprocessor_cls(config)
for taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
for node in taxonomy.iter_nodes():
if preprocessor:
result = preprocessor.preprocess(taxonomy, node)
if result.status != FetcherStatus.FOUND or result.node is None:
continue # skip this entry
node = result.node
names = {
lang: lang_names
for lang, lang_names in node.names.items()
if lang in supported_langs
lang: lang_name
for lang, lang_name in node.names.items()
if lang in supported_langs and lang_name
}
synonyms = {
lang: lang_names
for lang, lang_names in node.synonyms.items()
synonyms: dict[str, set[str]] = {
lang: set(node.synonyms.get(lang) or [])
for lang in node.synonyms
if lang in supported_langs
}

for lang, lang_name in names.items():
if lang_name:
synonyms.setdefault(lang, set()).add(lang_name)
# put the name as first synonym and order by length
synonyms_list: dict[str, list[str]] = {}
for lang, lang_synonyms in synonyms.items():
filtered_synonyms = filter(lambda s: s, lang_synonyms)
synonyms_list[lang] = sorted(
filtered_synonyms, key=lambda s: 0 if s == names[lang] else len(s)
)
yield {
"_index": next_index,
"_source": {
"id": node.id,
"taxonomy_name": taxonomy_name,
"taxonomy_name": taxonomy.name,
"name": names,
"synonyms": synonyms,
"synonyms": {
lang: {
"input": lang_synonyms,
"weight": max(100 - len(node.id), 0),
}
for lang, lang_synonyms in synonyms_list.items()
},
},
}

Expand Down Expand Up @@ -370,7 +396,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
success, errors = bulk(
es,
gen_taxonomy_documents(
config.taxonomy, next_index, supported_langs=set(config.supported_langs)
config, next_index, supported_langs=set(config.supported_langs)
),
raise_on_error=False,
)
Expand Down
11 changes: 7 additions & 4 deletions app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ def taxonomy_autocomplete(
description="Name(s) of the taxonomy to search in, as a comma-separated value."
),
],
lang: Annotated[
str, Query(description="Language to search in, defaults to 'en'.")
langs: Annotated[
str,
Query(
description="Languages to search in (as a comma separated list), defaults to 'en'."
),
] = "en",
size: Annotated[int, Query(description="Number of results to return.")] = 10,
fuzziness: Annotated[
Expand All @@ -167,7 +170,7 @@ def taxonomy_autocomplete(
query = build_completion_query(
q=q,
taxonomy_names=taxonomy_names_list,
lang=lang,
langs=langs.split(","),
size=size,
config=index_config,
fuzziness=fuzziness,
Expand All @@ -180,7 +183,7 @@ def taxonomy_autocomplete(
detail="taxonomy index not found, taxonomies need to be imported first",
)

response = process_taxonomy_completion_response(es_response)
response = process_taxonomy_completion_response(es_response, q, langs.split(","))

return {
**response,
Expand Down
20 changes: 20 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,26 @@ class TaxonomyConfig(BaseModel):
TaxonomyIndexConfig,
Field(description=TaxonomyIndexConfig.__doc__),
]
preprocessor: (
Annotated[
str,
Field(
description=cd_(
"""The full qualified reference to the preprocessor
to use before taxonomy entry import.
This class must inherit `app.indexing.BaseTaxonomyPreprocessor`
and specialize the `preprocess` method.
This is used to adapt the taxonomy schema
or to add specific fields for example.
"""
),
examples=["app.openfoodfacts.TaxonomyPreprocessor"],
),
]
| None
) = None


class ScriptConfig(BaseModel):
Expand Down
40 changes: 37 additions & 3 deletions app/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
from app._types import FetcherResult, FetcherStatus, JSONType
from app.config import (
ANALYZER_LANG_MAPPING,
Config,
FieldConfig,
FieldType,
IndexConfig,
TaxonomyConfig,
)
from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
from app.utils import load_class_object_from_string
from app.utils.analyzers import (
get_autocomplete_analyzer,
Expand Down Expand Up @@ -104,8 +104,41 @@ def preprocess_field_value(
return input_value


class BaseTaxonomyPreprocessor(abc.ABC):
    """Base class for taxonomy-entry preprocessors.

    Classes referenced in the taxonomy configuration ``preprocessor`` field
    must derive from this class and implement :meth:`preprocess`.
    """

    def __init__(self, config: IndexConfig) -> None:
        # Keep the full index configuration so subclasses can consult
        # settings such as supported languages.
        self.config = config

    @abc.abstractmethod
    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
        """Preprocess a taxonomy entry before ingestion in Elasticsearch,
        and before synonyms generation.

        This can be used to make the entry schema compatible with the
        project schema, or to add custom fields.

        :param taxonomy: the taxonomy the node belongs to
        :param node: the taxonomy node to transform
        :return: a TaxonomyNodeResult object:

            * the status can be used to decide whether
              to index the entry or not (even delete it)
            * the node is the transformed entry
        """
        pass


class BaseDocumentPreprocessor(abc.ABC):
def __init__(self, config: Config) -> None:
"""Base class for document preprocessors.
Classes referenced in index configuration `preprocess` field,
has to be derived from it.
"""

def __init__(self, config: IndexConfig) -> None:
self.config = config

@abc.abstractmethod
Expand All @@ -119,7 +152,7 @@ def preprocess(self, document: JSONType) -> FetcherResult:
* the status can be used to pilot wether
to index or not the document (even delete it)
* the document is the document transformed document
* the document is the transformed document
"""
pass
Expand Down Expand Up @@ -379,6 +412,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
"type": "category",
}
],
preserve_separators=False, # help match plurals
)
for lang in supported_langs
},
Expand Down
34 changes: 33 additions & 1 deletion app/openfoodfacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

from app._import import BaseDocumentFetcher
from app._types import FetcherResult, FetcherStatus, JSONType
from app.indexing import BaseDocumentPreprocessor
from app.indexing import BaseDocumentPreprocessor, BaseTaxonomyPreprocessor
from app.postprocessing import BaseResultProcessor
from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
from app.utils.download import http_session
from app.utils.log import get_logger

Expand Down Expand Up @@ -87,6 +88,37 @@ def generate_image_url(code: str, image_id: str) -> str:
OFF_API_URL = os.environ.get("OFF_API_URL", "https://world.openfoodfacts.org")


class TaxonomyPreprocessor(BaseTaxonomyPreprocessor):
    """Preprocessor for Open Food Facts taxonomies."""

    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
        """Preprocess a taxonomy node.

        We add a "main" language entry, and we also have specificities
        for some taxonomies (brands).

        :param taxonomy: the taxonomy the node belongs to
        :param node: the node to transform (mutated in place)
        :return: a TaxonomyNodeResult, always FOUND with the transformed node
        """
        if taxonomy.name == "brands":
            # brands are english only, put them in "main lang";
            # use .get() to stay robust to entries without an "en" name
            # (the previous node.names["en"] raised KeyError on such entries)
            if (en_name := node.names.get("en")) is not None:
                node.names.update(main=en_name)
            if node.synonyms and (synonyms_en := list(node.synonyms.get("en", []))):
                node.synonyms.update(main=synonyms_en)
        else:
            # main language is the entry id prefix (e.g. "en" in "en:apples"),
            # plus eventual xx entries
            id_lang = node.id.split(":")[0]
            if node_names := node.names.get(id_lang):
                node.names.update(main=node_names)
            node.synonyms.update(main=list(node.synonyms.get(id_lang, [])))
            # add eventual xx ("all languages") entries as synonyms
            # to all supported languages
            xx_name = node.names.get("xx")
            xx_names = [xx_name] if xx_name else []
            xx_names += node.synonyms.get("xx", [])
            if xx_names:
                for lang in self.config.supported_langs:
                    node.names.setdefault(lang, xx_names[0])
                    lang_synonyms = node.synonyms.setdefault(lang, [])
                    # extend in place, avoiding duplicate synonyms
                    lang_synonyms += [s for s in xx_names if s not in lang_synonyms]
        return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)


class DocumentFetcher(BaseDocumentFetcher):
def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
if item.get("action") == "deleted":
Expand Down
36 changes: 26 additions & 10 deletions app/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,32 @@ def load_result_processor(config: IndexConfig) -> BaseResultProcessor | None:
return result_processor_cls(config)


def process_taxonomy_completion_response(
    response: Response, input: str, langs: list[str]
) -> JSONType:
    """Build the API output from an ES completion-suggester response.

    :param response: the Elasticsearch response, holding one
        ``taxonomy_suggest_<lang>`` suggester per queried language
    :param input: the user input, echoed back in each option
    :param langs: the queried languages; the first one is used to pick
        the display name of each option
    :return: a dict with ``took``, ``timed_out`` and the scored ``options``
    """
    output = {"took": response.took, "timed_out": response.timed_out}
    options = []
    # track ids already seen, to deduplicate options across languages
    ids = set()
    # display names are taken from the first requested language
    lang = langs[0]
    for suggestion_id in dir(response.suggest):
        # one suggester was registered per language,
        # named taxonomy_suggest_<lang> (see build_completion_query)
        if not suggestion_id.startswith("taxonomy_suggest_"):
            continue
        for suggestion in getattr(response.suggest, suggestion_id):
            for option in suggestion.options:
                if option._source["id"] in ids:
                    continue
                ids.add(option._source["id"])
                result = {
                    "id": option._source["id"],
                    "text": option.text,
                    "name": getattr(option._source["name"], lang, ""),
                    "score": option._score,
                    "input": input,
                    "taxonomy_name": option._source["taxonomy_name"],
                }
                options.append(result)
    # highest score first
    output["options"] = sorted(
        options, key=lambda option: option["score"], reverse=True
    )
    return output
37 changes: 20 additions & 17 deletions app/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def build_es_query(
def build_completion_query(
    q: str,
    taxonomy_names: list[str],
    langs: list[str],
    size: int,
    config: IndexConfig,
    fuzziness: int | None = 2,
):
    """Build the Elasticsearch completion query for taxonomy autocomplete.

    :param q: the user autocomplete query
    :param taxonomy_names: a list of taxonomies we want to search in
    :param langs: the languages we want to search in
    :param size: number of results to return
    :param config: the index configuration to use
    :param fuzziness: fuzziness parameter for completion query
    :return: the built Query
    """
    query = Search(index=config.taxonomy.index.name)
    # register one completion suggester per requested language
    for lang in langs:
        completion_clause = {
            "field": f"synonyms.{lang}",
            "size": size,
            "contexts": {"taxonomy_name": taxonomy_names},
            "skip_duplicates": True,
        }
        if fuzziness is not None:
            completion_clause["fuzzy"] = {"fuzziness": fuzziness}

        query = query.suggest(
            f"taxonomy_suggest_{lang}",
            q,
            completion=completion_clause,
        )
    # limit returned fields; Search objects are immutable copies, so the
    # result of .source() must be re-assigned (the previous bare
    # `query.source(...)` call silently discarded the returned copy)
    query = query.source(fields=["id", "taxonomy_name", "name"])
    return query


Expand Down
Loading

0 comments on commit 5e05aef

Please sign in to comment.