sync_openalex_affiliations_github_issues.py
#!/usr/bin/env python3
# Execution example: python3 sync_openalex_affiliations_github_issues.py
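# Collect the issues opened on the dataesr/openalex-affiliations GitHub repository,
# parse the affiliation correction requests they contain (raw affiliation, previous
# and new ROR identifiers, OpenAlex works examples, contact), export them to a CSV
# file and publish that file on the ODS (data.enseignementsup-recherche.gouv.fr) platform.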
# Imports
from dotenv import load_dotenv
import os
import pandas as pd
import requests
load_dotenv()
# Config
GIT_PER_PAGE = 100
GIT_REPOSITORY_NAME = "dataesr/openalex-affiliations"
ODS_DATASET = "https://data.enseignementsup-recherche.gouv.fr/api/automation/v1.0/datasets/da_lyihp9"
ODS_FILE_ID = "re_agmowf"
OUTPUT_FILE_NAME = "github_issues.csv"
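# Secrets are read from the environment (e.g. a .env file loaded by load_dotenv above)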
try:
    GIT_TOKEN = os.environ["GIT_TOKEN"]
    GIT_USERNAME = os.environ["GIT_USERNAME"]
    ODS_API_KEY = os.environ["ODS_API_KEY"]
except KeyError:
    print("Some configuration variables are not defined!")
# Functions
def collect_issues():
    # Page through the GitHub issues API (open and closed issues) until a page
    # returns fewer results than the requested page size
    all_issues = []
    gh_session = requests.Session()
    gh_session.auth = (GIT_USERNAME, GIT_TOKEN)
    for p in range(1, 50000):
        issues_url = f"https://api.github.com/repos/{GIT_REPOSITORY_NAME}/issues?per_page={GIT_PER_PAGE}&page={p}&state=all"
        issues = gh_session.get(issues_url).json()
        all_issues += issues
        if len(issues) < GIT_PER_PAGE:
            break
    return all_issues
def parse_issue(issue):
    # Flatten one GitHub issue into a dict of CSV-ready fields
    new_elt = {}
    new_elt["github_issue_id"] = issue["number"]
    new_elt["github_issue_link"] = f"https://github.com/{GIT_REPOSITORY_NAME}/issues/{issue['number']}"
    new_elt["state"] = issue["state"]
    new_elt["date_opened"] = issue["created_at"][0:10]
    new_elt["date_closed"] = None if issue["closed_at"] is None else issue["closed_at"][0:10]
    # Markers delimiting each field inside the issue body
    a = "\nraw_affiliation_name: "
    b = "\nnew_rors: "
    c = "\nprevious_rors: "
    d = "\nworks_examples: "
    e = "\ncontact: "
    a_start = issue["body"].find(a) + len(a)
    a_end = issue["body"].find(b)
    b_start = a_end + len(b)
    b_end = issue["body"].find(c)
    c_start = b_end + len(c)
    c_end = issue["body"].find(d)
    d_start = c_end + len(d)
    d_end = issue["body"].find(e)
    e_start = d_end + len(e)
    e_end = len(issue["body"]) - 1
    new_elt["raw_affiliation_name"] = issue["body"][a_start:a_end].replace("\r", "")
    new_rors = [r.replace("\r", "") for r in issue["body"][b_start:b_end].split(";") if r]
    previous_rors = [r.replace("\r", "") for r in issue["body"][c_start:c_end].split(";") if r]
    # Compare the proposed ROR list with the previous one
    added_rors = list(set(new_rors) - set(previous_rors))
    removed_rors = list(set(previous_rors) - set(new_rors))
    new_elt["has_added_rors"] = 1 if len(added_rors) > 0 else 0
    new_elt["has_removed_rors"] = 1 if len(removed_rors) > 0 else 0
    new_elt["new_rors"] = ";".join(new_rors)
    new_elt["previous_rors"] = ";".join(previous_rors)
    new_elt["added_rors"] = ";".join(added_rors)
    new_elt["removed_rors"] = ";".join(removed_rors)
    new_elt["openalex_works_examples"] = ";".join([f"https://api.openalex.org/works/{work}" for work in issue["body"][d_start:d_end].replace("\r", "").split(";")])
    # The contact field is optional: parse it only if its marker was found
    if e_start > d_start:
        new_elt["contact"] = issue["body"][e_start:e_end].replace("\r", "")
        if "@" in new_elt["contact"]:
            new_elt["contact_domain"] = new_elt["contact"].split("@")[1].strip().replace("\r", "")
    return new_elt
def ods_sync():
    # Upload the CSV to the ODS dataset, attach it to the existing resource and publish
    url = f"{ODS_DATASET}/resources/files/"
    headers = { "Authorization": f"apikey {ODS_API_KEY}" }
    with open(OUTPUT_FILE_NAME, "rb") as f:
        response = requests.post(url, files={ "file": f }, headers=headers)
    payload = {
        "datasource": { "type": "uploaded_file", "file": { "uid": response.json().get("uid") } },
        "title": OUTPUT_FILE_NAME,
        "type": "csvfile",
    }
    requests.put(f"{ODS_DATASET}/resources/{ODS_FILE_ID}/", headers=headers, json=payload)
    requests.post(f"{ODS_DATASET}/publish/", headers=headers)
def main():
    data = []
    issues = collect_issues()
    for issue in issues:
        data.append(parse_issue(issue))
    pd.DataFrame(data).to_csv(OUTPUT_FILE_NAME, index=False)
    ods_sync()
# Main
if __name__ == "__main__":
    main()