Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: enhanced completion #259

Merged
merged 23 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
dbd54b2
feat: enhanced completion
alexgarel Oct 25, 2024
2b8ef15
wip: on updating suggestion web-component
alexgarel Oct 25, 2024
9d47ab6
chore: add lit-element-context library
alexgarel Oct 28, 2024
c9be731
refactor: split interfaces + wip on suggesters
alexgarel Oct 29, 2024
73c1599
chore: wip
alexgarel Oct 29, 2024
f6d0fe7
feat: suggester component working
alexgarel Oct 29, 2024
fc06482
fix: fix bugs on facets & history
alexgarel Oct 29, 2024
084bb62
Revert "chore: add lit-element-context library"
alexgarel Oct 29, 2024
69c6600
chore: some js updates
alexgarel Oct 29, 2024
87e05f3
docs: added comment
alexgarel Oct 29, 2024
02f4890
Update app/query.py
alexgarel Oct 31, 2024
61e242a
docs: add searchalicious-suggest to web-components ref
alexgarel Oct 31, 2024
9b04e22
fix: use pydantic for taxonomy to be able to generate docs
alexgarel Oct 31, 2024
24dfd4c
fix: fix suggester langs on facets completion
alexgarel Nov 6, 2024
38e242d
feat: add input to suggestion
alexgarel Nov 6, 2024
bd63cb9
fix: no language prefix in suggestion
alexgarel Nov 6, 2024
b398d3c
feat: add suggestion name, not only matched text
alexgarel Nov 6, 2024
dd15ff8
fix: translations for facets items, also if we don't have language pr…
alexgarel Nov 6, 2024
8b34dd3
fix: fix in taxonomy_es translations
alexgarel Nov 6, 2024
eaedc2e
fix: no completion query if input is empty
alexgarel Nov 6, 2024
63afe43
ci: grouping logs
alexgarel Nov 7, 2024
5c020a6
chore: fix doc by avoiding latest sphinx
alexgarel Nov 8, 2024
9beecd1
build: revert non intentional changes to Dockerfile.sphinx
alexgarel Nov 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ build-translations:
@echo "🔎 Building translations …"
${DOCKER_COMPOSE} run --rm search_nodejs npm run translations:build

cleanup-indexes:
@echo "🔎 Cleaning indexes …"
${DOCKER_COMPOSE} run --rm api python3 -m app cleanup-indexes ${args}

generate-openapi: _ensure_network
@echo "🔎 Generating OpenAPI spec …"
${DOCKER_COMPOSE} run --rm api python3 -m app export-openapi /opt/search/data/searchalicious-openapi.yml
Expand Down
42 changes: 33 additions & 9 deletions app/_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from redis import Redis

from app._types import FetcherResult, FetcherStatus, JSONType
from app.config import Config, IndexConfig, TaxonomyConfig, settings
from app.config import Config, IndexConfig, settings
from app.indexing import (
BaseTaxonomyPreprocessor,
DocumentProcessor,
generate_index_object,
generate_taxonomy_index_object,
Expand Down Expand Up @@ -252,7 +253,7 @@ def gen_documents(


def gen_taxonomy_documents(
taxonomy_config: TaxonomyConfig, next_index: str, supported_langs: set[str]
config: IndexConfig, next_index: str, supported_langs: set[str]
):
"""Generator for taxonomy documents in Elasticsearch.

Expand All @@ -261,26 +262,49 @@ def gen_taxonomy_documents(
:param supported_langs: a set of supported languages
:yield: a dict with the document to index, compatible with ES bulk API
"""
for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
taxonomy_config = config.taxonomy
preprocessor: BaseTaxonomyPreprocessor | None = None
if taxonomy_config.preprocessor:
preprocessor_cls = load_class_object_from_string(taxonomy_config.preprocessor)
preprocessor = preprocessor_cls(config)
for taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
for node in taxonomy.iter_nodes():
if preprocessor:
result = preprocessor.preprocess(taxonomy, node)
if result.status != FetcherStatus.FOUND or result.node is None:
continue # skip this entry
node = result.node
names = {
lang: lang_names
for lang, lang_names in node.names.items()
if lang in supported_langs
}
synonyms = {
lang: lang_names
for lang, lang_names in node.synonyms.items()
synonyms: dict[str, set[str]] = {
lang: set(node.synonyms.get(lang) or [])
for lang in node.synonyms
if lang in supported_langs
}
for lang, lang_names in names.items():
if lang_names:
if not isinstance(lang_names, str):
import pdb

pdb.set_trace()
synonyms.setdefault(lang, set()).add(lang_names)

yield {
"_index": next_index,
"_source": {
"id": node.id,
"taxonomy_name": taxonomy_name,
"taxonomy_name": taxonomy.name,
"name": names,
"synonyms": synonyms,
"synonyms": {
lang: {
"input": list(lang_synonyms),
"weight": max(100 - len(node.id), 0),
}
for lang, lang_synonyms in synonyms.items()
},
},
}

Expand Down Expand Up @@ -370,7 +394,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
success, errors = bulk(
es,
gen_taxonomy_documents(
config.taxonomy, next_index, supported_langs=set(config.supported_langs)
config, next_index, supported_langs=set(config.supported_langs)
),
raise_on_error=False,
)
Expand Down
9 changes: 6 additions & 3 deletions app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ def taxonomy_autocomplete(
description="Name(s) of the taxonomy to search in, as a comma-separated value."
),
],
lang: Annotated[
str, Query(description="Language to search in, defaults to 'en'.")
langs: Annotated[
str,
Query(
description="Languages to search in (as a comma separated list), defaults to 'en'."
),
] = "en",
size: Annotated[int, Query(description="Number of results to return.")] = 10,
fuzziness: Annotated[
Expand All @@ -167,7 +170,7 @@ def taxonomy_autocomplete(
query = build_completion_query(
q=q,
taxonomy_names=taxonomy_names_list,
lang=lang,
langs=langs.split(","),
size=size,
config=index_config,
fuzziness=fuzziness,
Expand Down
20 changes: 20 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,26 @@ class TaxonomyConfig(BaseModel):
TaxonomyIndexConfig,
Field(description=TaxonomyIndexConfig.__doc__),
]
preprocessor: (
Annotated[
str,
Field(
description=cd_(
"""The full qualified reference to the preprocessor
to use before taxonomy entry import.

This class must inherit `app.indexing.BaseTaxonomyPreprocessor`
and specialize the `preprocess` method.

This is used to adapt the taxonomy schema
or to add specific fields for example.
"""
),
examples=["app.openfoodfacts.TaxonomyPreprocessor"],
),
]
| None
) = None


class ScriptConfig(BaseModel):
Expand Down
40 changes: 37 additions & 3 deletions app/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
from app._types import FetcherResult, FetcherStatus, JSONType
from app.config import (
ANALYZER_LANG_MAPPING,
Config,
FieldConfig,
FieldType,
IndexConfig,
TaxonomyConfig,
)
from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
from app.utils import load_class_object_from_string
from app.utils.analyzers import (
get_autocomplete_analyzer,
Expand Down Expand Up @@ -104,8 +104,41 @@ def preprocess_field_value(
return input_value


class BaseTaxonomyPreprocessor(abc.ABC):
    """Base class for taxonomy entry preprocessors.

    Classes referenced in the taxonomy index configuration
    `preprocessor` field have to be derived from this class.
    """

    def __init__(self, config: IndexConfig) -> None:
        # keep the full index configuration so that subclasses can
        # read settings such as supported languages
        self.config = config

    @abc.abstractmethod
    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
        """Preprocess a taxonomy entry before ingestion in Elasticsearch,
        and before synonyms generation.

        This can be used to make the document schema compatible with the
        project schema or to add custom fields.

        :param taxonomy: the taxonomy the node belongs to
        :param node: the taxonomy node to transform
        :return: a TaxonomyNodeResult object:

        * the status can be used to pilot whether
          to index or not the entry (even delete it)
        * the node is the transformed entry
        """
        pass


class BaseDocumentPreprocessor(abc.ABC):
def __init__(self, config: Config) -> None:
"""Base class for document preprocessors.

Classes referenced in index configuration `preprocess` field,
has to be derived from it.
"""

def __init__(self, config: IndexConfig) -> None:
self.config = config

@abc.abstractmethod
Expand All @@ -119,7 +152,7 @@ def preprocess(self, document: JSONType) -> FetcherResult:

* the status can be used to pilot wether
to index or not the document (even delete it)
* the document is the document transformed document
* the document is the transformed document

"""
pass
Expand Down Expand Up @@ -379,6 +412,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
"type": "category",
}
],
preserve_separators=False, # help match plurals
)
for lang in supported_langs
},
Expand Down
34 changes: 33 additions & 1 deletion app/openfoodfacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

from app._import import BaseDocumentFetcher
from app._types import FetcherResult, FetcherStatus, JSONType
from app.indexing import BaseDocumentPreprocessor
from app.indexing import BaseDocumentPreprocessor, BaseTaxonomyPreprocessor
from app.postprocessing import BaseResultProcessor
from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
from app.utils.download import http_session
from app.utils.log import get_logger

Expand Down Expand Up @@ -87,6 +88,37 @@ def generate_image_url(code: str, image_id: str) -> str:
OFF_API_URL = os.environ.get("OFF_API_URL", "https://world.openfoodfacts.org")


class TaxonomyPreprocessor(BaseTaxonomyPreprocessor):
    """Preprocessor for Open Food Facts taxonomies."""

    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
        """Preprocess a taxonomy node,

        We add the main language, and we also have specificities for some taxonomies
        """
        if taxonomy.name == "brands":
            # brands are english only, put them in "main lang"
            # NOTE(review): assumes every brand node has an "en" name —
            # a missing key would raise KeyError here; confirm against the data
            node.names.update(main=node.names["en"])
            if node.synonyms and (synonyms_en := list(node.synonyms.get("en", []))):
                node.synonyms.update(main=synonyms_en)
        else:
            # main language is entry id prefix + eventual xx entries
            id_lang = node.id.split(":")[0]
            if node_names := node.names.get(id_lang):
                node.names.update(main=node_names)
            node.synonyms.update(main=list(node.synonyms.get(id_lang, [])))
            # add eventual xx entries as synonyms to all languages
            xx_name = node.names.get("xx")
            xx_names = [xx_name] if xx_name else []
            xx_names += node.synonyms.get("xx", [])
            if xx_names:
                for lang in self.config.supported_langs:
                    # only use the xx name as the display name if the
                    # language has no name of its own yet
                    node.names.setdefault(lang, xx_names[0])
                    # setdefault returns the (possibly new) list; extending it
                    # mutates node.synonyms in place
                    lang_synonyms = node.synonyms.setdefault(lang, [])
                    lang_synonyms += xx_names
        return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)


class DocumentFetcher(BaseDocumentFetcher):
def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
if item.get("action") == "deleted":
Expand Down
29 changes: 20 additions & 9 deletions app/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,24 @@ def load_result_processor(config: IndexConfig) -> BaseResultProcessor | None:
def process_taxonomy_completion_response(response: Response) -> JSONType:
    """Flatten an Elasticsearch completion-suggester response into a plain dict.

    One suggester is issued per requested language, each named
    ``taxonomy_suggest_<lang>`` (see ``build_completion_query``), so we
    collect options across all of them.

    NOTE: this reconstructs the merged (post-PR) implementation; the source
    shown contained remnants of both the old single-suggester body and the
    new multi-language one fused together by the diff view.

    :param response: the raw elasticsearch-dsl response to the suggest query
    :return: a dict with ``took``, ``timed_out`` and ``options``, where each
        option carries ``id``, ``text``, ``score`` and ``taxonomy_name``,
        sorted by descending score
    """
    output = {"took": response.took, "timed_out": response.timed_out}
    options = []
    ids = set()
    for suggestion_id in dir(response.suggest):
        # only look at our per-language suggesters, skip other attributes
        if not suggestion_id.startswith("taxonomy_suggest_"):
            continue
        for suggestion in getattr(response.suggest, suggestion_id):
            for option in suggestion.options:
                # deduplicate entries matched in more than one language
                if option._source["id"] in ids:
                    continue
                ids.add(option._source["id"])
                options.append(
                    {
                        "id": option._source["id"],
                        "text": option.text,
                        "score": option._score,
                        "taxonomy_name": option._source["taxonomy_name"],
                    }
                )
    # highest score first
    output["options"] = sorted(
        options, key=lambda option: option["score"], reverse=True
    )
    return output
37 changes: 20 additions & 17 deletions app/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def build_es_query(
def build_completion_query(
q: str,
taxonomy_names: list[str],
lang: str,
langs: list[str],
size: int,
config: IndexConfig,
fuzziness: int | None = 2,
Expand All @@ -331,28 +331,31 @@ def build_completion_query(

:param q: the user autocomplete query
:param taxonomy_names: a list of taxonomies we want to search in
:param lang: the language we want search in
:param langs: the languages we want search in
:param size: number of results to return
:param config: the index configuration to use
:param fuzziness: fuzziness parameter for completion query
:return: the built Query
"""

completion_clause = {
"field": f"synonyms.{lang}",
"size": size,
"contexts": {"taxonomy_name": taxonomy_names},
}

if fuzziness is not None:
completion_clause["fuzzy"] = {"fuzziness": fuzziness}

query = Search(index=config.taxonomy.index.name)
query = query.suggest(
"taxonomy_suggest",
q,
completion=completion_clause,
)
# import pdb;pdb.set_trace();
for lang in langs:
completion_clause = {
"field": f"synonyms.{lang}",
"size": size,
"contexts": {"taxonomy_name": taxonomy_names},
"skip_duplicates": True,
}
if fuzziness is not None:
completion_clause["fuzzy"] = {"fuzziness": fuzziness}

query = query.suggest(
f"taxonomy_suggest_{lang}",
q,
completion=completion_clause,
)
# limit returned fields
# query.source(fields=["id", "taxonomy_name"])
return query


Expand Down
Loading
Loading