Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(server): add paysage
Browse files Browse the repository at this point in the history
ahonestla committed Sep 25, 2024
1 parent 3c0dd4c commit d846ac5
Showing 3 changed files with 42 additions and 18 deletions.
38 changes: 23 additions & 15 deletions project/server/main/load_paysage.py
Original file line number Diff line number Diff line change
@@ -47,7 +47,7 @@
"YNqFb": "Commerce et gestion - Etablissements d’enseignement supérieur techniques privés et consulaires autorisés à délivrer un diplôme visé par le ministre chargé de l’enseignement supérieur et/ou à conférer le grade universitaire",
"iyn79": "Opérateur du programme 150 - Formations supérieures et recherche universitaire",
"z367d": "Structure de recherche",
# "NsMkU": "Établissement d'enseignement supérieur étranger",
"NsMkU": "Établissement d'enseignement supérieur étranger",
}


@@ -127,11 +127,7 @@ def load_paysage(index_prefix: str = "matcher") -> dict:
for criterion_value in criterion_values:
if criterion_value not in es_data[criterion]:
es_data[criterion][criterion_value] = []
es_data[criterion][criterion_value].append(
{
"id": data_point["id"],
}
)
es_data[criterion][criterion_value].append({"id": data_point["id"], "categories": data_point["categories"]})

# Bulk insert data into ES
actions = []
@@ -181,19 +177,31 @@ def download_data() -> list:

# Request data
limit = 10000
filters = "&".join([f"filters[relatedObjectId]={category}" for category in list(CATEGORIES.keys())])
url = f"{PAYSAGE_API_URL}/relations?limit={limit}&filters[relationTag]=structure-categorie&{filters}"
headers = {"X-API-KEY": PAYSAGE_API_KEY}
response = requests.get(url=url, headers=headers)
data = []

if response.status_code != 200:
logger.error(f"Error {response.status_code} requesting {url}")
return None
for category in CATEGORIES:
url = f"{PAYSAGE_API_URL}/relations?limit={limit}&filters[relationTag]=structure-categorie&filters[relatedObjectId]={category}"
response = requests.get(url=url, headers=headers)

if response.status_code != 200:
logger.error(f"Error {response.status_code} requesting {url}")
continue

current_data = response.json().get("data")
# logger.debug(f"Found {len(current_data)} paysage records for category {CATEGORIES[category]}")

current_data = pd.DataFrame(current_data).drop_duplicates(subset="resourceId").to_dict(orient="records")
logger.debug(f"Found {len(current_data)} paysage records for category {CATEGORIES[category]} without duplicates")

data = response.json().get("data")
logger.debug(f"Found {len(data)} paysage records for {len(CATEGORIES)} categories")
data += current_data

data = pd.DataFrame(data).drop_duplicates(subset="resourceId").to_dict(orient="records")
df = pd.DataFrame(data)
data = (
df.groupby(by="resourceId")
.agg({k: list if k == "relatedObjectId" else "first" for k in df.columns})
.to_dict(orient="records")
)
logger.debug(f"Keep {len(data)} paysage records without duplicates")

return data
17 changes: 17 additions & 0 deletions project/server/main/matcher.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import requests
import itertools
from fuzzywuzzy import fuzz

@@ -9,6 +10,7 @@
from project.server.main.my_elastic import MyElastic
from project.server.main.utils import remove_stop, normalize_text
from project.server.main.load_rnsr import get_siren
from project.server.main.load_paysage import PAYSAGE_API_URL, PAYSAGE_API_KEY, CATEGORIES

logger = get_logger(__name__)

@@ -183,6 +185,21 @@ def enrich_results(self, results, method):
elt[f].append(list(hit["_source"]["query"].values())[0]["content"]["query"])
except:
pass

# enrich with paysage categories
if method == "paysage":
try:
headers = {"X-API-KEY": PAYSAGE_API_KEY}
url = f"{PAYSAGE_API_URL}/relations?limit=100&filters[relationTag]=structure-categorie&filters[resourceId]={r}"
response = requests.get(url=url, headers=headers)
data = response.json().get("data")
categories = [d.get("relatedObjectId") for d in data if d.get("relatedObjectId") in CATEGORIES]
elt["paysage_categories"] = [
{"category": category, "label": CATEGORIES[category]} for category in categories
]
except:
pass

enriched.append(elt)
return enriched

5 changes: 2 additions & 3 deletions project/server/main/tasks.py
Original file line number Diff line number Diff line change
@@ -59,7 +59,7 @@ def create_task_load(args: dict = None) -> dict:
result.update(load_grid(index_prefix=index_prefix_dated))
result.update(load_rnsr(index_prefix=index_prefix_dated))
result.update(load_ror(index_prefix=index_prefix_dated))
# result.update(load_paysage(index_prefix=index_prefix_dated))
result.update(load_paysage(index_prefix=index_prefix_dated))
elif matcher_type == 'country':
result.update(load_country(index_prefix=index_prefix_dated))
elif matcher_type == 'grid':
@@ -71,8 +71,7 @@ def create_task_load(args: dict = None) -> dict:
elif matcher_type == 'wikidata':
result.update(load_wikidata(index_prefix=index_prefix_dated))
elif matcher_type == "paysage":
result = {"Error": "Matcher Paysage is not developped yet!"}
# result.update(load_paysage(index_prefix=index_prefix_dated))
result.update(load_paysage(index_prefix=index_prefix_dated))
else:
result = {'Error': f'Matcher type {matcher_type} unknown'}
# An alias is then put on the newly created indices

0 comments on commit d846ac5

Please sign in to comment.