forked from kn-bibs/pathways-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels.py
346 lines (262 loc) · 11.6 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
from typing import Callable, Mapping, Sequence
from warnings import warn
import numpy as np
import pandas as pd
class Gene:
"""Stores gene's identifier and description (multiton).
At a time there can be only one gene with given identifier,
i.e. after the first initialization, all subsequent attempts
to initialize a gene with the same identifier will return
exactly the same object. This is so called multiton pattern.
Example:
>>> x = Gene('TP53')
>>> y = Gene('TP53')
>>> assert x is y # passes, there is only one gene
"""
instances = {}
def __new__(cls, name, *args, **kwargs):
if name not in cls.instances:
cls.instances[name] = super(Gene, cls).__new__(cls, *args, **kwargs)
return cls.instances[name]
def __init__(self, name, description=None):
self.name = name
self.description = description
class Sample:
"""Sample contains expression values for genes."""
def __init__(self, name, data: Mapping[Gene, float]):
self.name = name
self.data = data
@property
def genes(self):
return self.data.keys()
@classmethod
def from_names(cls, name, data: Mapping[str, float]):
"""Create a sample from a gene_name: value mapping.
Args:
name: name of sample
data: mapping (e.g. dict) where keys represent gene names
"""
return cls(name, {Gene(gene_name): value for gene_name, value in data.items()})
@classmethod
def from_array(cls, name, panda_series: pd.Series, descriptions=False):
"""Create a sample from pd.Series or equivalent.
Args:
name: name of the sample
panda_series:
series object where columns represent values of genes and
names are either gene identifiers of tuples:
``(gene_identifier, description)``
descriptions:
are descriptions present in names of the series object?
"""
gene_maker = Gene
if descriptions:
gene_maker = lambda data: Gene(*data)
return cls(name, {
gene_maker(key): value
for key, value in panda_series.to_dict().items()
})
def as_array(self):
"""
Returns: one-dimensional labeled array with Gene objects as labels
"""
return pd.Series(self.data)
def __eq__(self, other):
return self.name == other.name and self.data == other.data
def __repr__(self):
return f'<Sample "{self.name}" with {len(self.data)} genes>'
def first_line(file_object):
line = None
while not line:
line = file_object.readline()
# return to the beginning
file_object.seek(0)
return line
# TODO class variable with set of genes + method(s) for checking data integrity
class SampleCollection:
"""A collection of samples of common origin or characteristic.
An example sample_collection can be:
(Breast_cancer_sample_1, Breast_cancer_sample_2) named "Breast cancer".
The common origin/characteristics for "Breast cancer" sample_collection could be
"a breast tumour", though samples had been collected from two donors.
Another example are controls:
(Control_sample_1, Control_sample_2) named "Control".
The common characteristic for these samples is that both are controls.
"""
def __init__(self, name, samples=None):
self.samples = samples or []
self.name = name
def as_array(self):
"""
Returns: :class:`pandas.DataFrame` object with data for all samples.
"""
return {s.name: pd.DataFrame(s) for s in self.samples}
def __add__(self, other):
return SampleCollection(self.name, self.samples + other.samples)
@classmethod
def from_file(
cls, name, file_object,
columns_selector: Callable[[Sequence[int]], Sequence[int]]=None,
samples=None, delimiter: str='\t', index_col: int=0,
use_header=True, reverse_selection=False, prefix=None,
header_line=0, description_column=None
):
"""Create a sample_collection (collection of samples) from csv/tsv file.
Args:
name:
a name of the sample_collection (or group of samples) which will
identify it (like "Tumour_1" or "Control_in_20_degrees")
file_object: a file (containing gene expression)
of the following structure:
- names of samples separated by a tab in the first row,
- gene symbol/name followed by gene expression values
for every sample in remaining rows;
an additional column "description" is allowed between genes
column and sample columns, though it has to be explicitly
declared with `description_column` argument.
columns_selector:
a function which will select (and return) a subset of
provided column identifiers (do not use with `samples`)
samples:
a list of names of samples to extract from the file
(do not use with `columns_selector`)
reverse_selection:
if you want to use all columns but the selected ones
(or all samples but the selected) set this to True
delimiter: the delimiter of the columns
index_col: column to use as the gene names
use_header: does the file have a header?
prefix: prefix for custom samples naming schema
header_line: number of non-empty line with sample names
description_column:
is column with description of present in the file
(on the second position, after gene identifiers)?
"""
if file_object.tell() != 0:
warn(f'Passed file object: {file_object} was read before.')
raise Exception()
line = first_line(file_object)
header_items = [item.strip() for item in line.split('\t')]
gene_columns = [index_col]
if description_column:
description_column = 1
gene_columns.append(description_column)
else:
if any('description' == name.lower() for name in header_items):
warn(
'First line of your file contains "description" column, '
'but you did not provide "--description_column" argument.'
)
# a reasonable assumption is that the columns with samples
# start after columns with gene symbol and gene description
column_shift = max(gene_columns) + 1
if columns_selector:
# sniff how many columns do we have in the file
columns_count = line.count(delimiter)
all_sample_columns = list(range(column_shift, columns_count + column_shift))
# generate identifiers (numbers) for all columns
# and take the requested subset
columns = columns_selector(all_sample_columns)
if reverse_selection:
columns = list(columns)
columns = [c for c in all_sample_columns if c not in columns]
# https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100
columns = gene_columns + list(columns)
else:
columns = None
if not use_header:
if samples:
raise ValueError(
'To select samples by their name, you need a file with '
'samples names in the header. If you use such file, '
'please set `use_header=True`, otherwise skip `samples` '
'in your arguments.'
)
if header_line:
warn(
'`header_line` has no effect when '
'`use_header` is set to `False`'
)
# we could leave it to pandas, but it shows an ugly,
# not very helpful message. It is better to show the
# user where exactly the problem occurs.
if samples:
available_samples = [
name
for name in header_items[column_shift:]
]
lacking_samples = set(samples) - set(available_samples)
if lacking_samples:
raise ValueError(
f'Samples {lacking_samples} are not available in {file_object.name} file.\n'
f'Following samples were found: {", ".join(available_samples)}.'
)
if index_col:
# TODO https://github.com/pandas-dev/pandas/issues/9098
warn(
'Using "samples" with "index_col" 0 may cause an '
'unexpected behaviour due to an upstream issue in '
'pandas package (pandas-dev/pandas/issues/9098) '
'for pandas in versions older than 0.21.'
)
additional_column_names = [
header_items[index]
for index in gene_columns
]
# https://github.com/pandas-dev/pandas/issues/9098#issuecomment-333677100
samples = additional_column_names + list(samples)
# just to reassure that the pointer is on the beginning
if file_object.tell() != 0:
warn('Passed file object was read before.')
if samples and columns:
warn(
'Please, provide either columns or samples, '
'not both. We will use columns this time.'
)
data = pd.read_table(
file_object,
delimiter=delimiter,
# None - do not use, 0 - use first row
header=header_line if use_header else None,
index_col=gene_columns,
usecols=columns or samples,
prefix=f'{prefix}_' if prefix else ''
)
descriptions = description_column is not None
samples = [
Sample.from_array(sample_name, sample_data, descriptions=descriptions)
for sample_name, sample_data in data.items()
]
return cls(name, samples)
@classmethod
def from_gsea_file(cls):
"""Stub: if we need to handle very specific files,
for various analysis methods, we can extend SampleCollection
with class methods like from_gsea_file."""
pass
# TODO class variable with set of genes + method(s) for checking data integrity
# TODO unify file reading with argument_parser
class Experiment:
def __init__(self, case: SampleCollection, control: SampleCollection):
self.control = control
self.case = case
def get_all(self):
return self.control + self.case
# TODO: are there many ways to compute fold-change?
def get_fold_change(self, sample_from_case, use_log=False):
assert sample_from_case in self.case.samples
# TODO: implement inline
calc_fold_change(sample_from_case, self.control, use_log=use_log)
"""
def fold_change(case, base, log2=False):
fold_changes = case.copy()
for (idx, row) in base.iterrows():
fold_changes.loc[[idx]] /= (np.mean(row) or 0.01) # TODO for now arbitrary value 0.01 when 0's are found
if log2:
fold_changes = np.log2(fold_changes) # TODO Runtime Warning when 0's are encountered
return fold_changes
"""
class Study:
def __init__(self, cases: Sequence[SampleCollection], control: SampleCollection):
for case in cases:
self.experiments = Experiment(case, control)