-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauxFunctions.py
220 lines (176 loc) · 8.3 KB
/
auxFunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Description: Auxiliary functions
import pandas as pd
from pandas import DataFrame
class BFFSheets:
    """Container holding one pandas DataFrame per output table.

    Every table starts out as an empty DataFrame declared at class level.
    Code that fills a table is expected to rebind the attribute with the
    extended frame (e.g. ``sheets.runs = pd.concat(...)``) rather than
    mutating the shared default in place.
    """
    # NOTE(review): "BFF" presumably stands for Beacon Friendly Format - confirm.
    analyses: DataFrame = pd.DataFrame()
    biosamples: DataFrame = pd.DataFrame()
    cohorts: DataFrame = pd.DataFrame()
    datasets: DataFrame = pd.DataFrame()
    individuals: DataFrame = pd.DataFrame()
    runs: DataFrame = pd.DataFrame()
def parsePackage(bffSheets:BFFSheets, package) -> int:
    """Parse one XML package and append its data to the tables in *bffSheets*.

    Reads the ``EXPERIMENT``, ``SAMPLE`` and ``RUN_SET`` children of *package*
    and appends one row each to ``bffSheets.analyses``, ``bffSheets.biosamples``
    and ``bffSheets.runs``. A row is added to ``bffSheets.individuals`` only
    when the individual id is not already listed there.

    Args:
        bffSheets: Table container; its DataFrame attributes are rebound to
            the extended frames.
        package: XML element providing ``find``/``findall``/``attrib``
            (assumed to be an ``xml.etree.ElementTree``-style element -
            TODO confirm against the caller).

    Returns:
        0 on success. -1 after printing an error when the run set does not
        hold exactly one run, the run date cannot be found, or a required
        sample attribute is missing. All checks happen before any table is
        touched, so a -1 result leaves *bffSheets* unchanged.
    """
    # --- Parse XML values in this package ---
    exp = package.find('EXPERIMENT')
    individualId = exp.attrib['alias'].split("_")[0]

    libDescriptor = exp.find("DESIGN").find("LIBRARY_DESCRIPTOR")
    libStrategy = libDescriptor.find("LIBRARY_STRATEGY").text
    libSourceText = libDescriptor.find("LIBRARY_SOURCE").text
    libSourceId, libSourceLabel = getOntologyCodeFromLabelLibSource(libSourceText)
    libSelection = libDescriptor.find("LIBRARY_SELECTION").text
    # this should contain at most 1 value, but in case it has more the script adds all of them
    libLayout = ",".join(layout.tag for layout in libDescriptor.find("LIBRARY_LAYOUT"))

    platformElem = exp.find("PLATFORM")[0]
    platform = platformElem.tag
    platformLabel = platformElem.find("INSTRUMENT_MODEL").text

    # Collect every TAG/VALUE pair first so missing attributes can be detected
    # explicitly instead of surfacing later as unbound local variables.
    sample = package.find('SAMPLE')
    sampleAttribs = {}
    for attribute in sample.find("SAMPLE_ATTRIBUTES"):
        sampleAttribs[attribute.find("TAG").text] = attribute.find("VALUE").text

    sexId = sexLabel = None
    if "sex" in sampleAttribs:
        sexId, sexLabel = getOntologyCodeFromLabelSex(sampleAttribs["sex"])

    runSet = package.find("RUN_SET")
    if len(runSet) != 1:
        print(f"ERROR: This script was designed to handle run_sets with a single element, found a run_set with {len(runSet)} elements")
        return -1
    run = runSet[0]
    identifiers = run.find("IDENTIFIERS")
    runId = identifiers.find("PRIMARY_ID").text
    # ATTENTION: We use this way to get biosampleId to match the VCF values presented in genomicVariations
    biosampleId = identifiers.find("SUBMITTER_ID").text.split("_L")[0].replace("-","_")
    #biosampleId = sample.attrib['alias']

    runDate = None
    for sraFile in run.find("SRAFiles").findall("SRAFile"):
        if sraFile.attrib['filename'] == runId:
            runDate = sraFile.attrib['date'].split(" ")[0] # remove the hour part of the date
    if runDate is None:
        print("ERROR: Couldn't find runDate")
        return -1

    # The individuals table starts as an empty frame without an "id" column,
    # so the column has to be checked before it is indexed.
    knownIndividuals = bffSheets.individuals
    isNewIndividual = (
        "id" not in knownIndividuals.columns
        or individualId not in knownIndividuals["id"].values
    )

    # "sex" is only needed when a new individuals row has to be written.
    requiredTags = ["disease_stage", "disease"]
    if isNewIndividual:
        requiredTags.append("sex")
    missingTags = [tag for tag in requiredTags if tag not in sampleAttribs]
    if missingTags:
        print(f"ERROR: Sample is missing required attribute(s): {', '.join(missingTags)}")
        return -1
    stage = sampleAttribs["disease_stage"]
    sampleDisease = sampleAttribs["disease"]

    # --- Fill database with parsed values ---
    # handle data not found in the XML
    analysesId = biosampleId
    analysesDate = runDate
    # create dataframe with new analyses data
    analysesDf = pd.DataFrame({
        "id": [analysesId],
        "biosampleId": [biosampleId],
        "individualId": [individualId],
        "runId": [runId],
        "analysisDate": [analysesDate],
    })
    # add data to table
    bffSheets.analyses = pd.concat((bffSheets.analyses, analysesDf), ignore_index=True)

    # check type of biosample
    if sampleDisease == "normal":
        biosampleStatusId = "EFO:0009654"
        biosampleStatusLabel = "reference sample"
    else:
        biosampleStatusId = "EFO:0009655"
        biosampleStatusLabel = "abnormal sample"
    # Look the grade up once so an unknown grade is reported a single time.
    tumorGradeId, tumorGradeLabel = mapTumorGrade2Ontology(stage)
    # create dataframe with new biosamples data
    biosamplesDf = pd.DataFrame({
        "id": [biosampleId],
        "individualId": [individualId],
        "biosampleStatus.id": [biosampleStatusId],
        "biosampleStatus.label": [biosampleStatusLabel],
        # Disabled columns; the age value is available as sampleAttribs.get("age").
        #"phenotypicFeatures_onset.age.iso8601duration": [getISO8601DurationFromAge(age)],
        #"measurements_observationMoment.age.iso8601duration": [getISO8601DurationFromAge(age)],
        "tumorGrade.label": [tumorGradeLabel],
        "tumorGrade.id": [tumorGradeId],
    })
    # add data to table
    bffSheets.biosamples = pd.concat((bffSheets.biosamples, biosamplesDf), ignore_index=True)

    # add data to individuals table, but only the first time an individual is seen
    if isNewIndividual:
        individualsDf = pd.DataFrame({
            "id": [individualId],
            "sex.label": [sexLabel],
            "sex.id": [sexId],
            # TODO FIND OUT HOW TO ENABLE THIS (age is available as sampleAttribs.get("age"))
            #"phenotypicFeatures_onset.age.iso8601duration": [getISO8601DurationFromAge(age)],
            # TODO CHECK THAT THIS IS CORRECT
            #"phenotypicFeatures_onset": ["Age"], # adding column to select the type of age param
        })
        bffSheets.individuals = pd.concat((bffSheets.individuals, individualsDf), ignore_index=True)

    # add data to runs table
    runsDf = pd.DataFrame({
        "id": [runId],
        "biosampleId": [biosampleId],
        "individualId": [individualId],
        "libraryLayout": [libLayout],
        "librarySelection": [libSelection],
        "librarySource.label": [libSourceLabel],
        "librarySource.id": [libSourceId],
        "libraryStrategy": [libStrategy],
        "platform": [platform],
        "platformModel.label": [platformLabel],
        "platformModel.id": [getOntologyCodeFromLabelPlatform(platformLabel)],
        "runDate": [runDate],
    })
    # add data to table
    bffSheets.runs = pd.concat((bffSheets.runs, runsDf), ignore_index=True)
    return 0
def addRow(df:DataFrame, row) -> DataFrame:
    """Return a new DataFrame made of *df* followed by *row*.

    The previous implementation called ``DataFrame.append`` and discarded the
    result, so it never added anything (and the method no longer exists in
    pandas 2.x). *df* itself is not modified: callers must use the returned
    frame, e.g. ``df = addRow(df, row)``.

    Args:
        df: Table to extend.
        row: A dict or Series describing one row, or a DataFrame of rows.

    Returns:
        The extended table with its index renumbered from 0.
    """
    rowFrame = row if isinstance(row, DataFrame) else pd.DataFrame([row])
    return pd.concat((df, rowFrame), ignore_index=True)
# "IIIA" -> ("NCIT:C28076", "Grade 3a")
def mapTumorGrade2Ontology(gradeStr) -> tuple[str, str]:
    """Map a Roman-numeral grade string to an ``(ontology id, label)`` pair.

    Example: ``"IIIA"`` -> ``("NCIT:C28076", "Grade 3a")``.

    An exact match is tried first. If that fails and *gradeStr* is a string,
    the lookup is retried with surrounding whitespace removed and with the
    text upper- and lower-cased, so spellings such as ``"IIIb"`` resolve
    without dedicated alias entries.

    Args:
        gradeStr: Grade text as found in the XML.

    Returns:
        The matching ``(id, label)`` tuple, or ``("", gradeStr)`` after
        printing a warning when the grade is not recognized.
    """
    grade2 = ("NCIT:C28078", "Grade 2")
    grade3 = ("NCIT:C28079", "Grade 3")
    gradeMap = {
        "not applicable": ("NCIT:C48660", "Not Applicable"),
        "I": ("NCIT:C28077", "Grade 1"),
        "II": grade2,
        # No dedicated sub-grade term is mapped for 2a/2b/2c: fall back to Grade 2.
        "IIA": grade2,
        "IIB": grade2,
        "IIC": grade2,
        "III": grade3,
        "IIIA": ("NCIT:C28076", "Grade 3a"),
        "IIIB": ("NCIT:C28081", "Grade 3b"),
        # No dedicated term is mapped for 3c: fall back to Grade 3.
        "IIIC": grade3,
    }
    if gradeStr in gradeMap:
        return gradeMap[gradeStr]
    if isinstance(gradeStr, str):
        trimmed = gradeStr.strip()
        for candidate in (trimmed, trimmed.upper(), trimmed.lower()):
            if candidate in gradeMap:
                return gradeMap[candidate]
    print(f"Warning: Unknown tumor grade: {gradeStr}. No tumor grade ontology code will be defined.")
    return ("", gradeStr)
def getOntologyCodeFromLabelPlatform(labelPlatform:str) -> str:
    """Return the ontology code for a sequencing platform model label.

    The label is compared after trimming surrounding whitespace and
    lowercasing it. An empty string is returned for labels that are not in
    the lookup table.
    """
    # lookup table: normalized label -> ontology code
    codeByPlatform = {
        "illumina novaseq 6000": "OBI:0002630",
    }
    normalized = labelPlatform.strip().lower()
    return codeByPlatform.get(normalized, "")
def getISO8601DurationFromAge(ageStr:str) -> str:
    """Wrap *ageStr* as a duration in years: ``"45"`` becomes ``"P45Y"``.

    The value is inserted as-is; no validation or conversion is performed.
    """
    return "P{}Y".format(ageStr)
def getOntologyCodeFromLabelLibSource(labelLibSource:str) -> tuple[str,str]:
    """Translate a library source label into ``(ontology id, canonical label)``.

    Matching is done on the trimmed, lowercased label. Both the short form
    ("genomic") and the long form ("genomic source") are accepted. For any
    other label a warning is printed and ``("", <normalized label>)`` is
    returned.
    """
    # all terms in:
    # www.ebi.ac.uk/ols/ontologies/genepio/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGENEPIO_0001965&lang=en&viewMode=All&siblings=false#
    genomic = ("GENEPIO:0001966", "genomic source")
    transcriptomic = ("GENEPIO:0001971", "transcriptomic source")
    knownSources = {
        "genomic": genomic,
        "genomic source": genomic,
        "transcriptomic": transcriptomic,
        "transcriptomic source": transcriptomic,
    }
    label = labelLibSource.strip().lower()
    resolved = knownSources.get(label)
    if resolved is not None:
        return resolved
    print(f"Warning: Unknown library source: {label}. No library source ontology code will be defined.")
    return ("", label)
def getOntologyCodeFromLabelSex(labelSex:str) -> tuple[str,str]:
    """Translate a sex label into ``(ontology id, canonical label)``.

    The label is trimmed and lowercased before the lookup. "not collected"
    and "unknown" both resolve to the same "unknown" entry. For any other
    unrecognized label a warning is printed and ``("", <normalized label>)``
    is returned.
    """
    unknownEntry = ("NCIT:C17998", "unknown")
    knownSexes = {
        'male': ('NCIT:C20197', "male"),
        'female': ('NCIT:C16576', "female"),
        'not collected': unknownEntry,
        'unknown': unknownEntry,
    }
    label = labelSex.strip().lower()
    if label in knownSexes:
        code, canonicalLabel = knownSexes[label]
        return (code, canonicalLabel)
    print(f"Warning: Unknown sex/gender: {label}. No sex/gender ontology code will be defined.")
    return ("", label)