-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcsv2json.py
172 lines (144 loc) · 8.07 KB
/
csv2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3.10
import json
import pandas as pd
from pathlib import Path
from typing import Final
from pprint import pprint
from paper import Paper, PaperEncoder
# Session names grouped under the "oral" label.
# NOTE(review): this constant is not referenced in the visible part of the
# script (oral detection relies on the program headings) - confirm it is used.
ORAL_SESSIONS: Final[list[str]] = [
    "(midl board session)",
    "Computer-assisted diagnosis",
    "Graph-based methods",
    "Neuroimaging",
    "Segmentation 1",
    "Segmentation 2",
    "Semi-supervised/self-supervised methods",
    "Synthesis",
    "Unsupervised/Weakly supervised methods",
]
def parse_program(lines: list[str]) -> list[tuple[str, str, str]]:
    """Extract the bullet entries from the lines of the program page.

    Three kinds of lines are recognised, everything else is ignored:

    * ``## <day>``      sets the current day,
    * ``### <session>`` sets the current session heading,
    * ``* <title>``     is one entry, attached to the current day/session.

    Args:
        lines: the raw lines of the program file, line breaks included.

    Returns:
        One ``(day, session, title)`` tuple per bullet line, in file order.
        ``title`` is stripped and lower-cased. ``day`` has its last character
        (the line break) removed. ``session`` is kept verbatim, line break
        included, because it is appended as-is to the schedule strings.
    """
    entries: list[tuple[str, str, str]] = []
    day = ""
    session = ""
    for line in lines:
        if line.startswith("## "):
            day = line[3:-1]
        elif line.startswith("### "):
            session = line[4:]
        elif line.startswith("* "):
            entries.append((day, session, line[2:].strip().lower()))
    return entries


def _paper_from_row(row, *, oral: bool, short: bool) -> "Paper":
    """Build a Paper from one CSV row.

    The row must provide the ``number``, ``title``, ``authors``, ``forum`` and
    ``abstract`` columns. ``or_id`` is the segment of ``forum`` that follows
    its first ``=``. ``oral`` and ``short`` are passed on as the strings
    ``"True"`` / ``"False"``.
    """
    return Paper(id=row["number"], title=row["title"], authors=row["authors"],
                 or_id=row["forum"].split('=')[1], oral=str(oral),
                 short=str(short), abstract=row["abstract"], ignore_schedule=True)


def _regenerate_ids(json_dict: dict) -> dict:
    """Rebuild every entry as a Paper and re-key the dict by ``conf_id``.

    Go back and forth, because some conf_id might have changed since the
    entries were created (for instance, orals that are downgraded to posters).
    """
    rebuilt = {p.conf_id: p
               for p in [Paper(**fields, ignore_schedule=True) for fields in json_dict.values()]}
    return json.loads(json.dumps(rebuilt, cls=PaperEncoder, indent=4, sort_keys=True))


def main() -> None:
    """Merge the conference CSV/JSON inputs into ``papers.json``.

    Steps, in order: read the program page, load full and short papers, apply
    ``patch.json``, drop the retracted papers, add the MELBA papers, attach
    schedule / video / slides / poster-board information, regenerate the IDs,
    print sanity warnings and finally write ``papers.json``.

    Raises:
        KeyError: when a patched ID, a retracted ID, a program title, a
            virtual paper ID, a poster PDF name or a poster-board title does
            not match any known paper.
    """
    # The program page is parsed once and reused for oral detection and for
    # the schedule. Text inputs are read as UTF-8 rather than the platform default.
    with open("pages/program.txt", encoding="utf-8") as program:
        program_entries = parse_program(program.readlines())

    # A set gives O(1) membership tests in the loop over the full papers.
    oral_titles: set[str] = {title for _, session, title in program_entries
                             if "Oral" in session or "MIDL board" in session}

    papers: list[Paper] = []

    full_papers_df = pd.read_csv("full_papers.csv")
    for _, full_row in full_papers_df.iterrows():
        is_oral = full_row["title"].strip().lower() in oral_titles
        paper = _paper_from_row(full_row, oral=is_oral, short=False)
        accepted_as_oral = full_row["decision"] == 'Accept (Oral)'
        if accepted_as_oral and not is_oral:
            print(f"NOTE: {paper.conf_id} downgraded from Oral to poster")
        elif is_oral and not accepted_as_oral:
            # The opposite mismatch used to be reported as a downgrade too.
            print(f"NOTE: {paper.conf_id} scheduled as Oral but not accepted as Oral")
        papers.append(paper)
    print(f">>> Loaded {len(full_papers_df)} full papers")

    short_papers_df = pd.read_csv("short_papers.csv")
    for _, short_row in short_papers_df.iterrows():
        papers.append(_paper_from_row(short_row, oral=False, short=True))
    print(f">>> Loaded {len(short_papers_df)} short papers")

    paper_dict: dict[str, Paper] = {p.conf_id: p for p in papers}
    json_dict: dict = json.loads(json.dumps(paper_dict, cls=PaperEncoder, indent=4, sort_keys=True))
    print(">>> First conversion to json")

    patch_file: str = "patch.json"
    with open(patch_file, encoding="utf-8") as pf:
        patch: dict = json.load(pf)
    print(">>> Patching")
    for conf_id, overrides in patch.items():
        json_dict[conf_id] |= overrides
    # The patch may change fields that conf_id is derived from.
    json_dict = _regenerate_ids(json_dict)

    # Hardcoded list here of retracted papers
    retracted: list[str] = ['S074', 'S089', 'S128']
    for conf_id in retracted:
        del json_dict[conf_id]
    print(f">> Retrated {len(retracted)} papers ({retracted})")
    print(f">> Patched json with {patch_file}")

    with open("melba.json", encoding="utf-8") as melba:
        melba_json = json.load(melba)
    for entry in melba_json:
        # 'melba_id' looks like "<prefix>:<number>"; each author string is cut
        # at its first '#'.
        melba_number: int = int(entry['melba_id'].split(':')[1])
        json_dict[f"M{melba_number:03d}"] = {
            'abstract': entry['abstract'],
            'authors': ", ".join(a.split('#')[0] for a in entry['authors']),
            'award': None,
            'id': melba_number,
            'or_id': "",
            'oral': "False",
            'pmlr_url': "",
            'schedule': "",
            'short': "False",
            'melba': "True",
            'title': entry['title'],
        }
    print(f">>> Loaded {len(melba_json)} for melba to journal to conf")

    # Normalised title -> paper ID, used to match program and poster-board titles.
    title_dict: dict[str, str] = {fields["title"].strip().lower(): conf_id
                                  for conf_id, fields in json_dict.items()}

    for day, session, title in program_entries:
        json_dict[title_dict[title]]["schedule"] += f"{day}: {session}"

    print(">>> Adding virtual information")
    midl_virtual_df = pd.read_csv("virtual_papers.csv")
    for _, row in midl_virtual_df.iterrows():
        if pd.notna(row["Video link"]):
            json_dict[row["Paper #"]]["yt_full"] = row["Video link"]

    for pdf in Path("static/virtual/poster").glob("*.pdf"):
        # Only the leading "static" directory is dropped, so a file name that
        # happens to contain "static" is left intact.
        json_dict[pdf.stem]["slides"] = str(pdf).removeprefix("static")

    print(">>> Parsing poster locations")
    poster_boards: list[tuple[str, str]] = [
        ("Monday", "posterBoardNumbersMon.csv"),
        ("Tuesday", "posterBoardNumbersTue.csv"),
        ("Wednesday", "posterBoardNumbersWeds.csv"),
    ]
    for day_name, boards_csv in poster_boards:
        print(f">> {day_name}")
        # No header row: column 0 is the board number, column 1 the title.
        boards_df = pd.read_csv(boards_csv, header=None)
        for _, row in boards_df.iterrows():
            json_dict[title_dict[row[1].strip().lower()]]["poster_loc"] = row[0]

    print(">>> Regenerate IDs, final sanity checks...")
    json_dict = _regenerate_ids(json_dict)

    for fields in json_dict.values():
        paper = Paper(**fields)
        if paper.oral and paper.poster_loc == "Virtual only":
            print(f"WARNING: {paper.conf_id} - {paper.title} has no poster location")
        if paper.oral and not any("Oral" in s or "MIDL board" in s for s in paper.schedule):
            print(f"WARNING: {paper.conf_id} - {paper.title} has no oral session")

    # Report what is actually written: retractions and MELBA papers make this
    # differ from the number of papers loaded from the CSV files.
    print(f">>> Writing {len(json_dict)} to papers.json...")
    with open("papers.json", 'w', encoding="utf-8") as sink:
        json.dump(json_dict, sink, indent=4, sort_keys=True)


if __name__ == "__main__":
    main()