-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpersons.py
50 lines (39 loc) · 2.67 KB
/
persons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd, pickle, numpy as np, warnings, time, os
# Silence pandas' "incompatible dtype" FutureWarning.
# `message` is a regular expression matched against the START of the warning
# text, and that text does not contain the category name; the category is
# therefore passed through its own argument instead of being spelled inside
# the message pattern.
warnings.filterwarnings(
    "ignore",
    message="Setting an item of incompatible dtype",
    category=FutureWarning,
)
# Turn on the pandas copy-on-write option for the whole script.
pd.options.mode.copy_on_write = True
from config_path import PATH_CLEAN, PATH_API
from functions_shared import chunkify, work_csv
from step7_persons.prep_persons import persons_preparation
from step7_persons.affiliations import affiliations, persons_files_import, persons_api_simplify, persons_results_clean
# Date tag handed to the preparation / affiliation steps (both calls are
# currently commented out in this script).
CSV_DATE = '20250121'
# Step toggle, disabled. Presumably regenerates the pickles read below -- confirm.
# persons_preparation(CSV_DATE)
PATH_PERSONS = f"{PATH_API}persons/"

# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this pipeline.
perso_part = pd.read_pickle(f"{PATH_CLEAN}persons_participants.pkl")
perso_app = pd.read_pickle(f"{PATH_CLEAN}persons_applicants.pkl")

# Stack participants and applicants, each de-duplicated on its own first.
pp = pd.concat(
    [perso_part.drop_duplicates(), perso_app.drop_duplicates()],
    ignore_index=True,
)
# Contact name with hyphens replaced by spaces.
pp['contact2'] = pp.contact.str.replace('-', ' ')

# --- Data preparation for the OpenAlex requests ---
lvar = ['contact2', 'orcid_id', 'country_code', 'iso2', 'destination_code', 'thema_code', 'nationality_country_code']
# Keep persons that are French by country or nationality, or attached to one
# of the listed destination codes ...
in_scope = (
    (pp.country_code == 'FRA')
    | (pp.nationality_country_code == 'FRA')
    | (pp.destination_code.isin(['COG', 'PF', 'STG', 'ADG', 'POC', 'SyG', 'PERA', 'SJI']))
)
# ... and that carry at least a contact name or an ORCID.
has_identifier = ~((pp.contact2.isnull()) & (pp.orcid_id.isnull()))
mask = in_scope & has_identifier
df = pp.loc[mask, lvar].sort_values(['country_code', 'orcid_id'], ascending=False).drop_duplicates()

# The message announces the number of rows WITH an ORCID, so count the
# non-null values (the previous version counted the null ones).
n_with_orcid = int(df.orcid_id.notna().sum())
print(f"size pp: {len(df)}, info sur pp with orcid: {n_with_orcid}")
# Step toggle, disabled; its effect is not visible from this script.
# affiliations(pp, PATH_PERSONS, CSV_DATE)

# Load the 'other' and 'erc' result sets from PATH_PERSONS, then push both
# through simplification and cleaning, one stage at a time ('other' first at
# every stage).
_frames = [persons_files_import(kind, PATH_PERSONS) for kind in ('other', 'erc')]
_frames = [persons_api_simplify(frame) for frame in _frames]
oth, em = [persons_results_clean(frame) for frame in _frames]
del _frames
# Columns of pp carried into the match against the API results.
lvar = [
    'project_id', 'generalPic', 'role', 'contact',
    'title_clean', 'gender', 'email', 'tel_clean', 'domaine_email',
    'orcid_id', 'birth_country_code', 'nationality_country_code',
    'host_country_code', 'sending_country_code', 'iso2', 'stage', 'contact2',
    'country_code', 'shift', 'call_year', 'thema_code', 'destination_code',
    'entities_id', 'entities_name', 'id_secondaire', 'country_code_mapping',
]
# French by country or nationality, or attached to one of the listed
# destination codes, and carrying at least a contact name or an ORCID.
mask = (
    (pp.country_code == 'FRA')
    | (pp.nationality_country_code == 'FRA')
    | (pp.destination_code.isin(['COG', 'PF', 'STG', 'ADG', 'POC', 'SyG', 'PERA', 'SJI']))
) & (~(pp.contact.isnull() & pp.orcid_id.isnull()))
df = pp.loc[mask, lvar].sort_values(['country_code', 'orcid_id'], ascending=False).drop_duplicates()

# Keep only persons whose hyphen-free name and country code both match a row
# of the 'other' API results.
df = df.merge(oth, how='inner', left_on=['contact2', 'country_code'], right_on=['display_name', 'iso3'])
# Duplicates are detected on the string form of each row; presumably some
# merged columns hold unhashable values such as lists -- confirm.
df = df[~df.astype(str).duplicated()]
# df['years']=df['years'].map(lambda liste: ';'.join(str(x) for x in liste))

# Flag rows whose call year appears in the matched result's `years` value.
# Built with zip instead of a row-wise apply: on an empty frame
# apply(axis=1) returns a DataFrame rather than a boolean Series, which breaks
# the column assignment; zip also avoids creating one Series per row.
df['filt'] = [
    call_year in years
    for call_year, years in zip(df['call_year'], df['years'])
]