add code to handle multiple names
toutestprismemeca committed Dec 7, 2022
1 parent fd986f6 commit 816519c
Showing 1 changed file with 46 additions and 26 deletions.
72 changes: 46 additions & 26 deletions Etoiles_Europe.ipynb
@@ -14,8 +14,7 @@
"pydref = Pydref()\n",
"\n",
"import pandas as pd, requests, re, string, time, urllib3, os, dotenv\n",
"requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)\n",
"HEADERS = os.environ.get('HEADERS')"
"requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)"
]
},
{
@@ -73,7 +72,7 @@
"etoiles = etoiles.assign(person = etoiles[\"prenom\"] +\" \"+ etoiles[\"nom\"])\n",
"\n",
"# etoiles = etoiles.assign(type_de_prix = \"Etoiles de l'Europe\")\n",
"etoiles = etoiles.assign(edition=str(edition), type_de_prix=\"Etoiles de l'Europe\", annee=str(an), person=etoiles[\"prenom\"] +\" \"+ etoiles[\"nom\"])"
"etoiles = etoiles.assign(edition=str(edition), type_de_prix=\"Etoiles de l'Europe\", edition=str(an), person=etoiles[\"prenom\"] +\" \"+ etoiles[\"nom\"])"
]
},
{
@@ -138,6 +137,38 @@
" e['sexe'] = e['sexe'].capitalize()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"project_dict = {}\n",
"for e in et:\n",
" list_to_change=['prenom', 'nom', 'idref', 'idref_lien', 'sexe_code', 'sexe']\n",
" project_id = e['project_id']\n",
" if project_id not in project_dict:\n",
" new_elt = {'project_id': project_id}\n",
" for col in ['mention', 'edition', 'type_de_prix', 'id_struct']:\n",
" new_elt[col] = e[col]\n",
" for col in list_to_change:\n",
" new_elt[col] = []\n",
" else:\n",
" new_elt = project_dict[project_id]\n",
" for col in list_to_change: \n",
" new_elt[col].append(e[col])\n",
" project_dict[project_id] = new_elt\n",
"\n",
"for e in project_dict:\n",
" elt = project_dict[e]\n",
" for f in elt:\n",
" if isinstance(elt[f], list):\n",
" project_dict[e][f] = ';'.join(elt[f])\n",
" \n",
"tmp=pd.DataFrame(project_dict.values())\n",
"tmp=tmp.to_dict(orient=\"records\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -147,16 +178,12 @@
"# fonction de match pour les organisations: lien avec id_struct rnsr, siren,...\n",
"\n",
"def struct_matcher(data):\n",
" url_match = \"http://185.161.45.213/organizations/organizations/_match?id=\"\n",
" url = url_match + data.get(\"id_struct\")\n",
"# print(url)\n",
" rq = requests.get(url=url, headers=HEADERS)\n",
" print(rq.text)\n",
" print(data.get('id_struct'))\n",
" url = f\"http://185.161.45.213/organizations/organizations/_match?id={data.get('id_struct')}\"\n",
" rq = requests.get(url=url, headers={'accept':'application/json', 'Authorization': os.environ.get('token')})\n",
" response = rq.json()\n",
"# print(response)\n",
" if response.get(\"hits\") == 1:\n",
" result = response.get(\"data\")[0].get(\"id\")\n",
"# print(result)\n",
" return result"
]
},
@@ -167,7 +194,7 @@
"outputs": [],
"source": [
"# match pour récupérer le id_scanr des structures et créer un url scanr\n",
"for e in et: \n",
"for e in tmp: \n",
" e[\"structure_identifiant\"] = struct_matcher(e)\n",
" if e[\"structure_identifiant\"]:\n",
" e[\"structure_lien_scanr\"] = \"https://scanr.enseignementsup-recherche.gouv.fr/structure/\" + e.get(\"structure_identifiant\")\n",
@@ -188,7 +215,7 @@
" url_match = \"http://185.161.45.213/organizations/scanr/\"\n",
" if data.get(\"structure_identifiant\"):\n",
" url = url_match + data.get(\"structure_identifiant\")\n",
" rq = requests.get(url = url, headers=HEADERS)\n",
" rq = requests.get(url = url, headers={'accept':'application/json', 'Authorization': os.environ.get('token')})\n",
" response = {\n",
" \"structure_libelle\": rq.json().get(\"label\", {}).get(\"default\"),\n",
" \"structure_sigle\": rq.json().get(\"acronym\", {}).get(\"fr\"),\n",
@@ -211,7 +238,7 @@
"outputs": [],
"source": [
"# lancement fonction recup infos des structures et remplace les sauts de ligne par un espace (pas sur que ça marche !)\n",
"for e in et:\n",
"for e in tmp:\n",
" if struct_info(e) is not None:\n",
" e.update(struct_info(e))\n",
" if e.get(\"lat\"):\n",
@@ -229,7 +256,7 @@
" url_match = \"http://185.161.45.213/datastore/geocodes/\"\n",
" if data.get(\"commune_code\"):\n",
" url = url_match + data.get(\"commune_code\")\n",
" rq = requests.get(url = url, headers=HEADERS)\n",
" rq = requests.get(url = url, headers={'accept':'application/json', 'Authorization': os.environ.get('token')})\n",
" response = {\n",
" \"commune_nom\": rq.json().get(\"com_nom\"),\n",
" \"unite_urbaine_code\": rq.json().get(\"uu_id\"),\n",
@@ -248,7 +275,7 @@
"metadata": {},
"outputs": [],
"source": [
"for e in et:\n",
"for e in tmp:\n",
" if geocod(e): \n",
" e.update(geocod(e))"
]
@@ -264,7 +291,7 @@
" id = str(data.get(\"project_id\"))\n",
" if id:\n",
" url = url_match + id\n",
" rq = requests.get(url = url, headers=HEADERS)\n",
" rq = requests.get(url = url, headers={'accept':'application/json', 'Authorization': os.environ.get('token')})\n",
" if rq.status_code == 200:\n",
" if rq.json().get(\"type\") == \"H2020\":\n",
" response = {\n",
@@ -301,7 +328,7 @@
"metadata": {},
"outputs": [],
"source": [
"for e in et: \n",
"for e in tmp: \n",
" e[\"projet_lien_scanr\"] = \"https://scanr.enseignementsup-recherche.gouv.fr/project/\" + str(e[\"project_id\"])\n",
" e[\"projet_lien_cordis\"] = \"https://cordis.europa.eu/project/id/\" + str(e[\"project_id\"])\n",
" \n",
@@ -330,7 +357,7 @@
"metadata": {},
"outputs": [],
"source": [
"export = pd.DataFrame.from_dict(et, orient='columns')\n",
"export = pd.DataFrame.from_dict(tmp, orient='columns')\n",
"export = pd.concat([full, export], ignore_index=True)"
]
},
@@ -340,7 +367,7 @@
"metadata": {},
"outputs": [],
"source": [
"export = export[['type_de_prix', 'edition', 'annee', 'mention', 'prenom', 'nom', 'sexe_code', 'sexe', 'idref', \n",
"export = export[['type_de_prix', 'edition', 'mention', 'prenom', 'nom', 'sexe_code', 'sexe', 'idref', \n",
" 'projet_acronyme', 'projet_titre', 'appel_a_projet_code', 'thematique', \n",
" 'programme_code', 'programme_nom', 'financement_type', 'financement_name', 'project_id', 'projet_resume',\n",
" 'structure_identifiant', 'structure_libelle', 'structure_sigle', \n",
@@ -361,13 +388,6 @@
"export.to_csv(PATH + \"open data/fr-esr-etoile-de-l-europe.csv\", sep=\";\", \n",
" encoding=\"ANSI\", na_rep=\"\", index=False, line_terminator='\\r')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {