-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgenerate_bigTable.py
43 lines (31 loc) · 1.65 KB
/
generate_bigTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python3
"""
This script generates metadata for EpiAtlas data
based on the EpiRR registries available from https://www.ebi.ac.uk/vg/epirr/view/all?format=json.
It generates two files:
1. EpiAtlas_EpiRR.txt (long format)
2. EpiAtlas_EpiRR_metadata_all.csv (wide format: reshaped from EpiAtlas_EpiRR.txt file)
NaN is produced by pivot_table when reshaping from long to wide format (i.e. the value of a specific metadata item was not originally reported by the project).
NA means the value of a specific metadata item was reported by the project as "NA".
for questions please contact Abdulrahman Salhab: [email protected]
"""
import urllib.request, json
import pandas as pd
# Index of all EpiRR records; each entry links to the full record via ["_links"]["self"].
EPIRR_INDEX_URL = "https://www.ebi.ac.uk/vg/epirr/view/all?format=json"
# Long-format output: one tab-separated line per (record, metadata item).
LONG_TABLE_PATH = 'raw/EpiAtlas_EpiRR.txt'
# Wide-format output: one row per record, one column per metadata item.
WIDE_TABLE_PATH = r'raw/EpiAtlas_EpiRR_metadata_all.csv'
# Column names shared by the long-format header line and the DataFrame.
COLUMNS = ['EpiRR', 'EpiRR_status', 'project', 'metadata', 'value']


def fetch_json(url):
    """Download *url* and return its body parsed as JSON.

    The response is used as a context manager so the connection is
    closed even when reading or parsing fails.
    """
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())


def long_rows(record):
    """Return the long-format rows for one EpiRR record.

    *record* is the parsed JSON of a single record. The result holds one
    ``[full_accession, status, project, key, value]`` list per item of
    ``record["meta_data"]``, in the order the items appear; it is empty
    when the record has no metadata items.
    """
    identity = [record["full_accession"], record["status"], record["project"]]
    return [identity + [key, value] for key, value in record["meta_data"].items()]


def to_wide(rows):
    """Reshape long-format *rows* into a wide table.

    The result is indexed by (EpiRR, EpiRR_status, project) and has one
    column per metadata key. Metadata items a record does not have show
    up as missing values (written as "NaN" by ``main``).
    """
    # Naming the columns in the constructor also works for an empty row list,
    # where assigning ``df.columns`` afterwards raises a length mismatch.
    long_table = pd.DataFrame(rows, columns=COLUMNS)
    return long_table.pivot_table(
        index=["EpiRR", "EpiRR_status", "project"],
        columns='metadata',
        values='value',
        aggfunc='first',
    )


def main():
    """Download every EpiRR record and write the long and wide tables."""
    index = fetch_json(EPIRR_INDEX_URL)
    rows = []
    # ``with`` guarantees the file is flushed and closed if a request fails
    # part-way through. UTF-8 matches the default encoding of ``to_csv``, so
    # both outputs share one encoding regardless of the locale.
    with open(LONG_TABLE_PATH, 'w', encoding='utf-8') as long_file:
        long_file.write('\t'.join(COLUMNS) + '\n')
        for entry in index:
            record = fetch_json(entry["_links"]["self"])
            for row in long_rows(record):
                # str() keeps a non-string JSON value from aborting the run.
                long_file.write('\t'.join(map(str, row)) + '\n')
                rows.append(row)
    to_wide(rows).to_csv(WIDE_TABLE_PATH, na_rep="NaN")


if __name__ == "__main__":
    main()