mapXML2XLSX.py
#!/usr/bin/env python3
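"""Fill the BFF multi-sheet Excel template (analyses, biosamples, cohorts,
datasets, individuals, runs) from an SRA experiment-package XML export.

Parsing of the individual XML packages is delegated to auxFunctions.parsePackage;
this script orchestrates the reading, the final adjustments and the Excel export.
"""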
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import time  # only used by the commented-out sleep at the end of the script
from auxFunctions import *
# requirements: xlsxwriter, openpyxl, pandas, numpy
# IMPORTANT: THIS XML LIBRARY IS NOT SECURE AGAINST XML VULNERABILITIES
# Read more: https://docs.python.org/3/library/xml.etree.elementtree.html
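# A commonly recommended mitigation (assuming the optional defusedxml package
# is installed; not used here) is to swap the import for its drop-in
# replacement:
#   import defusedxml.ElementTree as ET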
XML_FILE = "data/SraExperimentPackage.xml"
TEMPLATE_PATH="input_template_empty.xlsx"
OUTPUT_PATH="data/output.xlsx"
FILL_STATIC_VALUES = True # if True, fill static values specific to this dataset
# Sheet names
ANALYSES_SHEET_NAME = "analyses"
BIOSAMPLES_SHEET_NAME = "biosamples"
COHORTS_SHEET_NAME = "cohorts"
DATASETS_SHEET_NAME = "datasets"
INDIVIDUALS_SHEET_NAME = "individuals"
RUNS_SHEET_NAME = "runs"
SHEET_NAMES = [ANALYSES_SHEET_NAME, BIOSAMPLES_SHEET_NAME, COHORTS_SHEET_NAME, DATASETS_SHEET_NAME, INDIVIDUALS_SHEET_NAME, RUNS_SHEET_NAME]
# parse the XML document
tree = ET.parse(XML_FILE)
# get the root element
root = tree.getroot()
# read excel file with the template + filled information
analyses = pd.read_excel(TEMPLATE_PATH, sheet_name=ANALYSES_SHEET_NAME)
biosamples = pd.read_excel(TEMPLATE_PATH, sheet_name=BIOSAMPLES_SHEET_NAME)
cohorts = pd.read_excel(TEMPLATE_PATH, sheet_name=COHORTS_SHEET_NAME)
datasets = pd.read_excel(TEMPLATE_PATH, sheet_name=DATASETS_SHEET_NAME)
individuals = pd.read_excel(TEMPLATE_PATH, sheet_name=INDIVIDUALS_SHEET_NAME)
runs = pd.read_excel(TEMPLATE_PATH, sheet_name=RUNS_SHEET_NAME)
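# note: the six reads above could also be collapsed into a single call such as
# pd.read_excel(TEMPLATE_PATH, sheet_name=SHEET_NAMES), which returns a dict of
# dataframes keyed by sheet name (alternative sketch, not used here)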
# create a BFFSheets object
bff = BFFSheets()
# add the sheets to the BFFSheets object
bff.analyses = analyses
bff.biosamples = biosamples
bff.cohorts = cohorts
bff.datasets = datasets
bff.individuals = individuals
bff.runs = runs
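# As used in this script, BFFSheets (from auxFunctions) only needs to behave as
# a plain container with one dataframe attribute per sheet; a minimal
# illustrative equivalent (not the actual auxFunctions implementation) would be
# a bare class such as:
#   class BFFSheets:
#       pass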
# walk the parsed XML and fill the BFFSheets object, one package at a time
for package in root:
    parsePackage(bff, package)
# ---- Final data additions/adjustments ----
# mark empty cells as NaN
bff.analyses.replace("", np.nan, inplace=True)
bff.biosamples.replace("", np.nan, inplace=True)
bff.cohorts.replace("", np.nan, inplace=True)
bff.datasets.replace("", np.nan, inplace=True)
bff.individuals.replace("", np.nan, inplace=True)
bff.runs.replace("", np.nan, inplace=True)
# -- fill cohort generic values --
# create a dataframe with a single empty row
emptyDf = pd.DataFrame({
    "id": [np.nan],
})
# add an empty row to the cohorts table if needed
if len(bff.cohorts) == 0:
    bff.cohorts = pd.concat((bff.cohorts, emptyDf), ignore_index=True)
# add an empty row to the datasets table if needed
if len(bff.datasets) == 0:
    bff.datasets = pd.concat((bff.datasets, emptyDf), ignore_index=True)
bff.cohorts['cohortSize'] = len(bff.individuals)
# count number of non-NaN values on the age column in individuals table
#ageAvailableCount = bff.individuals['phenotypicFeatures_onset.age.iso8601duration'].count()
#bff.cohorts['collectionEvents_eventAgeRange.availabilityCount'] = ageAvailableCount
#bff.cohorts['collectionEvents_eventAgeRange.availability'] = 'TRUE' if ageAvailableCount > 0 else 'FALSE'
# same but for sex/gender
genderAvailableCount = bff.individuals['sex.id'].count()
bff.cohorts['collectionEvents_eventGenders.availabilityCount'] = genderAvailableCount
bff.cohorts['collectionEvents_eventGenders.availability'] = 'TRUE' if genderAvailableCount > 0 else 'FALSE'
# calculate sex inclusion and distribution
sexDistribution = {}
sexInclusion = []
# dropna() skips individuals without a sex value (empty cells were set to NaN above)
sexTypes = bff.individuals["sex.id"].dropna().unique()
for sexType in sexTypes:
    # sexTypeDf is a dataframe with all rows of individuals of a certain sex type
    sexTypeDf = bff.individuals[bff.individuals['sex.id'] == sexType]
    sexCount = len(sexTypeDf)
    sexLabel = sexTypeDf["sex.label"].iloc[0]
    sexDistribution[sexLabel] = sexCount
    sexInclusion.append({"id": sexType, "label": sexLabel})
# add sex inclusion and distribution to cohort table
bff.cohorts['inclusionCriteria.genders'] = str(sexInclusion).replace("'", '"')
bff.cohorts['collectionEvents_eventGenders.distribution'] = \
    str([{"genders": sexDistribution}]).replace("'", '"')
# this function is defined here but only called later, after the static values
# below have been filled in, so it does not interfere with them
def addListOfIdsToDataset():
    # get the list of biosample ids
    biosampleIds = bff.biosamples['id'].tolist()
    bff.datasets['ids.biosampleIds'] = str(biosampleIds).replace("'", '"')
    # get the list of individual ids
    individualIds = bff.individuals['id'].tolist()
    bff.datasets['ids.individualIds'] = str(individualIds).replace("'", '"')
# ---- Static values (added manually for this dataset) ----
if FILL_STATIC_VALUES:
    bff.analyses['aligner'] = 'BWA-MEM'
    bff.analyses['pipelineName'] = 'TSO500 and WES'
    bff.analyses['pipelineRef'] = 'https://www.nature.com/articles/s41525-021-00177-w'
    bff.analyses['variantCaller'] = 'Strelka-2.9.10'
    # TODO probably correct this to be something like tumor biopsy and normal biopsy
    # (depending on the sample type)
    bff.biosamples['sampleOriginType.id'] = "OBI:0001876"
    bff.biosamples['sampleOriginType.label'] = "cell culture"
    bff.cohorts['id'] = 'stage_II_and_III_colorectal_cancer'
    bff.cohorts['name'] = 'Stage II and III colorectal cancer'
    bff.cohorts['cohortType'] = 'study-defined'
    # find use conditions here: https://ega-archive.org/about/data_use_conditions
    bff.datasets['dataUseConditions.duoDataUse'] = '[' \
        '{' \
        '"id": "DUO:0000004",' \
        '"label": "no restriction",' \
        '"version": "2019-01-07"' \
        '},' \
        '{' \
        '"id": "DUO:0000042",' \
        '"label": "general research use",' \
        '"version": "2019-01-07"' \
        '},' \
        '{' \
        '"id": "DUO:0000026",' \
        '"label": "user specific restriction",' \
        '"version": "2019-01-07"' \
        '},' \
        '{' \
        '"id": "DUO:0000028",' \
        '"label": "institution specific restriction",' \
        '"version": "2019-01-07"' \
        '}' \
        ']'
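    # The hand-built string above could equivalently be produced with the
    # standard json module (illustrative alternative, not used here), e.g.:
    #   import json
    #   duo = [
    #       {"id": "DUO:0000004", "label": "no restriction", "version": "2019-01-07"},
    #       {"id": "DUO:0000042", "label": "general research use", "version": "2019-01-07"},
    #       {"id": "DUO:0000026", "label": "user specific restriction", "version": "2019-01-07"},
    #       {"id": "DUO:0000028", "label": "institution specific restriction", "version": "2019-01-07"},
    #   ]
    #   bff.datasets['dataUseConditions.duoDataUse'] = json.dumps(duo)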
    bff.datasets['description'] = 'Gene expression and whole exome sequencing of 114 stage II/III colorectal cancer patients with poor outcome'
    bff.datasets['externalUrl'] = 'https://www.nature.com/articles/s41525-021-00177-w'
    # id changed to match the one used for the variations
    #bff.datasets['id'] = 'PRJNA689313'
    bff.datasets['id'] = "nature_colorectal_cancer_test"
    bff.datasets['name'] = 'Molecular subtyping of stage II/III colorectal cancer with poor outcome'
    # study authors, credited on both the datasets and cohorts tabs
    studyAuthors = str(['Mahdi Golkaram', 'Michael L. Salmans', 'Shannon Kaplan', 'Raakhee Vijayaraghavan', 'Marta Martins', 'Nafeesa Khan', 'Cassandra Garbutt', 'Aaron Wise', 'Joyee Yao', 'Sandra Casimiro', 'Catarina Abreu', 'Daniela Macedo', 'Ana Lúcia Costa', 'Cecília Alvim', 'André Mansinho', 'Pedro Filipe', 'Pedro Marques da Costa', 'Afonso Fernandes', 'Paula Borralho', 'Cristina Ferreira', 'Fernando Aldeia', 'João Malaquias', 'Jim Godsey', 'Alex So', 'Traci Pawlowski', 'Luis Costa', 'Shile Zhang', 'Li Liu']).replace("'", '"')
    # credits on the datasets tab
    bff.datasets['info.origin'] = 'https://www.ncbi.nlm.nih.gov/bioproject/PRJNA689313'
    bff.datasets['info.credits.variantCallingAnalysis'] = "Daniel Sobral"
    bff.datasets['info.credits.studyAuthors'] = studyAuthors
    bff.datasets['info.credits.study'] = 'https://www.nature.com/articles/s41525-021-00177-w'
    bff.datasets['info.credits.biomaterialProvider'] = 'Luis Costa, Centro Hospitalar Universitário Lisboa Norte, Hospital de Santa Maria, Lisbon, Portugal'
    # credits on the cohorts tab
    bff.cohorts['info.origin'] = 'https://www.ncbi.nlm.nih.gov/bioproject/PRJNA689313'
    bff.cohorts['info.credits.variantCallingAnalysis'] = "Daniel Sobral"
    bff.cohorts['info.credits.studyAuthors'] = studyAuthors
    bff.cohorts['info.credits.study'] = 'https://www.nature.com/articles/s41525-021-00177-w'
    bff.cohorts['info.credits.biomaterialProvider'] = 'Luis Costa, Centro Hospitalar Universitário Lisboa Norte, Hospital de Santa Maria, Lisbon, Portugal'
    #bff.individuals['diseases_diseaseCode.id'] = 'ICD10CM:C18.9'
    bff.individuals['diseases_diseaseCode.id'] = 'DOID:9256'
    #bff.individuals['diseases_diseaseCode.label'] = 'stage II/III colorectal cancer'
    bff.individuals['diseases_diseaseCode.label'] = 'colorectal cancer'
    # TODO: FIND OUT HOW TO ENABLE THIS
    # see the phenopackets page about this:
    # https://phenopacket-schema.readthedocs.io/en/latest/phenotype.html#rstphenotypicfeature
    #bff.individuals['phenotypicFeatures_featureType.id'] = 'HP:0003003'
    #bff.individuals['phenotypicFeatures_featureType.label'] = 'Colon cancer'
# call the function to add the lists of ids to the datasets table
# NOTE: this should be dropped in the future; instead, use just datasetId on every table
addListOfIdsToDataset()
# add the dataset id to all the other tables
datasetId = bff.datasets['id'][0]
bff.analyses['datasetId'] = datasetId
bff.biosamples['datasetId'] = datasetId
bff.cohorts['datasetId'] = datasetId
bff.individuals['datasetId'] = datasetId
bff.runs['datasetId'] = datasetId
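# note: assigning a scalar to a dataframe column, as above, broadcasts the same
# datasetId value to every row of that table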
if True: # if True, export to an Excel file
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter')
    try:
        # Write each dataframe to a different worksheet.
        bff.analyses.to_excel(writer, sheet_name=ANALYSES_SHEET_NAME, index=False)
        bff.biosamples.to_excel(writer, sheet_name=BIOSAMPLES_SHEET_NAME, index=False)
        bff.cohorts.to_excel(writer, sheet_name=COHORTS_SHEET_NAME, index=False)
        bff.datasets.to_excel(writer, sheet_name=DATASETS_SHEET_NAME, index=False)
        bff.individuals.to_excel(writer, sheet_name=INDIVIDUALS_SHEET_NAME, index=False)
        bff.runs.to_excel(writer, sheet_name=RUNS_SHEET_NAME, index=False)
        # enlarge the columns to fit the data (autofit needs a reasonably recent XlsxWriter)
        for sheetName in SHEET_NAMES:
            worksheet = writer.sheets[sheetName]
            worksheet.autofit()
    finally:
        # Close the Pandas Excel writer and output the Excel file.
        writer.close()
#print("Sleeping 10 seconds before exiting...")
#time.sleep(10)
print("* Finished OK! *")
exit(0)