"""
The MIT License (MIT)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import requests
from requests.exceptions import HTTPError
import pandas as pd
import numpy as np
import scipy.stats
import json
import logging
from logging.config import dictConfig
import time
import threading
import sys
# Base URL of the ENCODE portal; the search queries below are appended to it.
ENCODE_BASE_URL = 'https://www.encodeproject.org'
# Search query returning all released MadQualityMetric objects attached to
# the RNA-seq-flavored assays listed, as embedded JSON, without paging.
MAD_SEARCH_URL = ('/search/?type=MadQualityMetric'
                  '&status=released'
                  '&assay_term_name=RNA-seq'
                  '&assay_term_name=shRNA+knockdown+followed+by+RNA-seq'
                  '&assay_term_name=CRISPRi+followed+by+RNA-seq'
                  '&assay_term_name=CRISPR+genome+editing+followed+by+RNA-seq'
                  '&assay_term_name=siRNA+knockdown+followed+by+RNA-seq'
                  '&assay_term_name=single+cell+isolation+followed+by+RNA-seq'
                  '&assay_term_name=PAS-seq'
                  '&frame=embedded'
                  '&format=json'
                  '&limit=all')
# Search query returning the released Experiments for the same assay set;
# used to build the file -> experiment lookup mapping.
EXPERIMENT_SEARCH_URL = (
    '/search/?status=released'
    '&type=Experiment'
    '&assay_term_name=RNA-seq'
    '&assay_term_name=shRNA+knockdown+followed+by+RNA-seq'
    '&assay_term_name=CRISPRi+followed+by+RNA-seq'
    '&assay_term_name=CRISPR+genome+editing+followed+by+RNA-seq'
    '&assay_term_name=siRNA+knockdown+followed+by+RNA-seq'
    '&assay_term_name=single+cell+isolation+followed+by+RNA-seq'
    '&assay_term_name=PAS-seq'
    '&frame=embedded'
    '&format=json'
    '&limit=all')
def get_metadata_object(query_url,
                        error_message,
                        logger,
                        raise_exception=False):
    """Get JSON data from a query URL.

    Makes a GET request to query_url and parses the response as JSON.

    Args:
        query_url: url that the GET is sent to. A json response is
            expected.
        error_message: message logged if an HTTPError occurs.
        logger: logger to log into.
        raise_exception: boolean. Re-raise the HTTPError after logging.
            Defaults to False.

    Returns:
        A dict with the response content in it, or None when the
        request failed with an HTTP error status and raise_exception
        is False.

    Raises:
        HTTPError: when the response status indicates an error and
            raise_exception is True.
    """
    r = requests.get(query_url)
    try:
        r.raise_for_status()
    except HTTPError:
        logger.exception(error_message)
        if raise_exception:
            # bare raise re-raises the active exception with its
            # original traceback intact (clearer than `raise (e)`)
            raise
        # make the silent-failure path explicit for readers
        return None
    return r.json()
def build_file_to_experiment_data_mapping(experiment_data_dict):
    """Build a file-to-experiment lookup mapping.

    Args:
        experiment_data_dict: dict that contains experiment metadata
            (an ENCODE search response with an '@graph' list).

    Returns:
        Lookup mapping with keys of format '/files/<file_accession>',
        and a dict
        {
            'experiment_accession' : <experiment accession>,
            'replication_type' : <replication type>,
            'biosample_type' : <biosample type>,
            'assembly' : <assembly>
        }
        as values. If the file does not have the attribute assembly,
        'Not applicable' is output.
    """
    lookup = {}
    for experiment in experiment_data_dict['@graph']:
        # metadata shared by every file of this experiment
        shared_record = {
            'experiment_accession': experiment['accession'],
            'replication_type': experiment['replication_type'],
            'biosample_type': experiment['biosample_type'],
        }
        for file_obj in experiment['files']:
            lookup[file_obj['@id']] = dict(
                shared_record,
                assembly=file_obj.get('assembly', 'Not applicable'))
    return lookup
def get_min_quantile_value(series1, series2, quant):
    """Calculate minimum of quantile thresholds.

    Given two pandas Series and a quantile, calculate the threshold
    for both Series, and return the smaller one.

    Args:
        series1: pandas Series.
        series2: pandas Series.
        quant: float quantile in [0, 1]. For the top 1% enter 0.99 etc.

    Returns:
        threshold: float, the smaller of the two quantile values.
    """
    # docstring previously referred to 'dataframe1'/'dataframe2' which do
    # not match the actual parameter names; behavior is unchanged.
    return min(series1.quantile(q=quant), series2.quantile(q=quant))
def get_pearson(arr1, arr2):
    """Return the Pearson correlation coefficient of two arrays.

    scipy.stats.pearsonr returns (coefficient, p-value); only the
    coefficient is of interest here.
    """
    coefficient, _p_value = scipy.stats.pearsonr(arr1, arr2)
    return coefficient
def get_spearman(arr1, arr2):
    """Return the Spearman rank correlation coefficient of two arrays.

    scipy.stats.spearmanr returns (correlation, p-value); only the
    correlation is of interest here.
    """
    correlation, _p_value = scipy.stats.spearmanr(arr1, arr2)
    return correlation
def get_log2_mean(arr1, arr2):
    """Return the element-wise mean of the log2-transformed inputs."""
    log2_first = np.log2(arr1)
    log2_second = np.log2(arr2)
    # multiplying by 0.5 is bit-identical to dividing by 2.0
    return 0.5 * (log2_first + log2_second)
def get_tsv(url, retries_loading_tsv):
    """Load a tsv into a DataFrame, retrying on failure.

    Args:
        url: location of the tsv file (anything pandas.read_csv
            accepts: URL, path, or file-like object).
        retries_loading_tsv: how many times to retry before giving up
            and returning None.

    Returns:
        pandas.DataFrame if all went well, None if retries_loading_tsv
        are exhausted.

    Note:
        Failure paths log through the module-level ``logger``, which is
        configured in the ``__main__`` block — presumably this function
        is only called from there; verify before importing elsewhere.
    """
    for attempt in range(retries_loading_tsv):
        try:
            tsv = pd.read_csv(url, sep='\t')
        # was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit; narrowed to Exception.
        except Exception:
            logger.exception('Attempt {} failed when loading first tsv in {}.'.
                             format(attempt, url))
        else:
            break
    else:
        # all attempts failed
        logger.warning('Failure in loading tsv {}. Skipping.'.format(url))
        return None
    return tsv
def build_record_of_correlation_metrics_from_madqc_obj(
        madqc_obj,
        file_to_experiment_mapping,
        base_url='https://www.encodeproject.org',
        retries_loading_tsv=5):
    """Take one madqc dict, and calculate correlation metric dict.

    Args:
        madqc_obj: dict containing a madQC object.
        file_to_experiment_mapping: dict built by
            build_file_to_experiment_data_mapping.
        base_url: base url the file download paths are appended to.
        retries_loading_tsv: max times to retry loading tsvs before
            giving up and returning None.

    Returns:
        Dict with following structure:
        {
        'mad_id' : id of the madQC object,
        'quality_metric_of': [file1, file2],
        'current_pearson' : pearson correlation from madQC,
        'current_spearman' : spearman correlation from madQC,
        'FPKM_gt_1_pearson' : pearson correlation between
            file1 and file2 FPKMs where entries for both files are greater
            than one. Log2 transform is applied before calculation,
        'FPKM_gt_1_spearman' : as above, but spearman,
        'FPKM_log2_mean_gt_0_pearson' : Log2 transformed
            pearson correlation. Mean of Log2(FPKM) less or equal than zero
            values are omitted,
        'FPKM_log2_mean_gt_0_spearman' : as above but spearman,
        'FPKM_log2_mean_gt_0_pearson_01_pct' : As above, but additionally
            top 0.1 percentile of FPKMs are omitted (threshold is
            calculated for both files and smaller is applied to both),
        'FPKM_log2_mean_gt_0_spearman_01_pct' : As above,
        'FPKM_log2_mean_gt_0_pearson_1_pct' : As above, but top 1
            percentile removed,
        'FPKM_log2_mean_gt_0_spearman_1_pct' : As above, but spearman,
        'FPKM_log2_mean_gt_0_pearson_10_pct' : As above, but top 10
            percentile removed,
        'FPKM_log2_mean_gt_0_spearman_10_pct' : As above, but spearman,
        'assembly' : Assembly used in creating quants, in case these are
            not equal for both files, raise AssertionError,
        'experiment_accession' : accession of the experiment files are
            from. In case of mismatch between files, raise
            AssertionError,
        'replication_type' : isogenic or anisogenic,
        'biosample_type' : Biosample type of experiment,
        'number_of_genes_detected' : [# of genes TPM > 1 in file1,
            # of genes TPM > 1 in file2],
        'file1_log2_mean_gt_0' : how many genes from file 1 make the cut,
        'file2_log2_mean_gt_0' : how many genes from file 2 make the cut
        }
        Returns None when the files cannot be found in the lookup
        mapping, when downloading the quantification tsvs fails, or
        when the tsvs lack an FPKM column.

    Raises:
        AssertionError if assemblies of files are not equal or files are
        not from same experiment.
    """
    # first check things that result in error, to fail as fast as possible.
    try:
        file1_meta = file_to_experiment_mapping[
            madqc_obj['quality_metric_of'][0]]
        file2_meta = file_to_experiment_mapping[
            madqc_obj['quality_metric_of'][1]]
    except KeyError:
        # NOTE(review): relies on the module-level `logger` configured in
        # the __main__ block — confirm before importing this elsewhere.
        logger.exception('{} and/or {} not in lookup mapping'.format(
            madqc_obj['quality_metric_of'][0],
            madqc_obj['quality_metric_of'][1]))
        return None
    # NOTE: assert is stripped under `python -O`; these validate input data.
    assert file1_meta['experiment_accession'] == file2_meta[
        'experiment_accession'], 'Mismatch in experiment accession.'
    assert file1_meta['assembly'] == file2_meta[
        'assembly'], 'Mismatch in assembly.'
    correlation_record = dict()
    correlation_record['mad_id'] = madqc_obj['@id']
    correlation_record['quality_metric_of'] = madqc_obj['quality_metric_of']
    correlation_record['current_pearson'] = madqc_obj['Pearson correlation']
    correlation_record['current_spearman'] = madqc_obj['Spearman correlation']
    correlation_record['assembly'] = file1_meta['assembly']
    correlation_record['experiment_accession'] = file1_meta[
        'experiment_accession']
    correlation_record['replication_type'] = file1_meta['replication_type']
    correlation_record['biosample_type'] = file1_meta['biosample_type']
    # that is all that can be done 'offline', get the actual files now.
    quants1 = get_tsv(
        base_url + madqc_obj['quality_metric_of'][0] + '@@download',
        retries_loading_tsv)
    quants2 = get_tsv(
        base_url + madqc_obj['quality_metric_of'][1] + '@@download',
        retries_loading_tsv)
    if (quants1 is None) or (quants2 is None):
        logger.warning('Getting tsvs for {} failed. Will be omitted.'.format(
            madqc_obj['@id']))
        return None
    # calculate filters for various conditions
    try:
        fpkm1 = quants1['FPKM']
        fpkm2 = quants2['FPKM']
    except KeyError:
        logger.exception('{} and {} lack FPKM'.format(
            madqc_obj['quality_metric_of'][0],
            madqc_obj['quality_metric_of'][1]))
        return None
    # a gene counts as "detected" when its TPM exceeds 1
    correlation_record['number_of_genes_detected'] = [
        sum(quants1['TPM'] > 1),
        sum(quants2['TPM'] > 1)
    ]
    # drop the full DataFrames; only the FPKM columns are needed below
    del quants1
    del quants2
    # keep genes with nonzero FPKM in both files (log2(0) would be -inf)
    neitherzero = (fpkm1 != 0) & (fpkm2 != 0)
    fpkm_gt_1 = (fpkm1 > 1) & (fpkm2 > 1)
    correlation_record['FPKM_gt_1_pearson'] = get_pearson(
        np.log2(fpkm1[fpkm_gt_1]), np.log2(fpkm2[fpkm_gt_1]))
    correlation_record['FPKM_gt_1_spearman'] = get_spearman(
        np.log2(fpkm1[fpkm_gt_1]), np.log2(fpkm2[fpkm_gt_1]))
    fpkm1_log2 = np.log2(fpkm1[neitherzero])
    fpkm2_log2 = np.log2(fpkm2[neitherzero])
    log2_mean = (fpkm1_log2 + fpkm2_log2) / 2.0
    # check with Seth if he wants AND or OR for quantiles
    # the smaller of the two per-file thresholds is applied to both files
    quantile_01 = get_min_quantile_value(fpkm1_log2, fpkm2_log2, 0.999)
    quantile_1 = get_min_quantile_value(fpkm1_log2, fpkm2_log2, 0.99)
    quantile_10 = get_min_quantile_value(fpkm1_log2, fpkm2_log2, 0.9)
    # OR-semantics: a gene is kept if either file is below the threshold
    quantile_01_condition = (fpkm1_log2 < quantile_01) | (fpkm2_log2 <
                                                          quantile_01)
    quantile_1_condition = (fpkm1_log2 < quantile_1) | (fpkm2_log2 <
                                                        quantile_1)
    quantile_10_condition = (fpkm1_log2 < quantile_10) | (fpkm2_log2 <
                                                          quantile_10)
    log2_mean_gt_0 = log2_mean > 0
    # record the sizes of each filtered gene set alongside the correlations
    correlation_record['file1_log2_mean_gt_0'] = len(
        fpkm1_log2[log2_mean_gt_0])
    correlation_record['file2_log2_mean_gt_0'] = len(
        fpkm2_log2[log2_mean_gt_0])
    correlation_record['file1_gt_0_and_01_quantile'] = len(
        fpkm1_log2[log2_mean_gt_0 & quantile_01_condition])
    correlation_record['file2_gt_0_and_01_quantile'] = len(
        fpkm2_log2[log2_mean_gt_0 & quantile_01_condition])
    correlation_record['file1_gt_0_and_1_quantile'] = len(
        fpkm1_log2[log2_mean_gt_0 & quantile_1_condition])
    correlation_record['file2_gt_0_and_1_quantile'] = len(
        fpkm2_log2[log2_mean_gt_0 & quantile_1_condition])
    correlation_record['file1_gt_0_and_10_quantile'] = len(
        fpkm1_log2[log2_mean_gt_0 & quantile_10_condition])
    correlation_record['file2_gt_0_and_10_quantile'] = len(
        fpkm2_log2[log2_mean_gt_0 & quantile_10_condition])
    correlation_record['FPKM_log2_mean_gt_0_pearson'] = get_pearson(
        fpkm1_log2[log2_mean_gt_0], fpkm2_log2[log2_mean_gt_0])
    correlation_record['FPKM_log2_mean_gt_0_spearman'] = get_spearman(
        fpkm1_log2[log2_mean_gt_0], fpkm2_log2[log2_mean_gt_0])
    correlation_record['FPKM_log2_mean_gt_0_pearson_01_pct'] = get_pearson(
        fpkm1_log2[log2_mean_gt_0 & quantile_01_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_01_condition])
    correlation_record['FPKM_log2_mean_gt_0_spearman_01_pct'] = get_spearman(
        fpkm1_log2[log2_mean_gt_0 & quantile_01_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_01_condition])
    correlation_record['FPKM_log2_mean_gt_0_pearson_1_pct'] = get_pearson(
        fpkm1_log2[log2_mean_gt_0 & quantile_1_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_1_condition])
    correlation_record['FPKM_log2_mean_gt_0_spearman_1_pct'] = get_spearman(
        fpkm1_log2[log2_mean_gt_0 & quantile_1_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_1_condition])
    correlation_record['FPKM_log2_mean_gt_0_pearson_10_pct'] = get_pearson(
        fpkm1_log2[log2_mean_gt_0 & quantile_10_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_10_condition])
    correlation_record['FPKM_log2_mean_gt_0_spearman_10_pct'] = get_spearman(
        fpkm1_log2[log2_mean_gt_0 & quantile_10_condition],
        fpkm2_log2[log2_mean_gt_0 & quantile_10_condition])
    return correlation_record
if __name__ == '__main__':
    # Logging configuration is read from an external JSON file
    # (logging.config.dictConfig schema).
    with open('logger_config.json') as f:
        logger_config = json.load(f)
    dictConfig(logger_config)
    logger = logging.getLogger(__name__)
    # number of worker threads, from the first command line argument
    nthreads = int(sys.argv[1])
    logger.info('Getting madQC metadata objects.')
    mad_request = requests.get(ENCODE_BASE_URL + MAD_SEARCH_URL)
    mad_request.raise_for_status()
    mad_data = mad_request.json()
    logger.info('Getting experiment metadata objects.')
    experiment_request = requests.get(ENCODE_BASE_URL + EXPERIMENT_SEARCH_URL)
    experiment_request.raise_for_status()
    experiment_data = experiment_request.json()
    logger.info('Building file -> experiment lookup mapping.')
    file_to_experiment_lookup = build_file_to_experiment_data_mapping(
        experiment_data)
    logger.info(
        'Built mapping for {} files'.format(len(file_to_experiment_lookup)))
    results = []
    mad_graph = mad_data['@graph']
    logger.info('Start building correlation report.')
    before = time.time()

    def worker():
        # Workers share mad_graph as an ad-hoc work queue. list.pop() and
        # list.append() are atomic under CPython's GIL, which is presumably
        # why no explicit lock is used here — confirm if porting to another
        # interpreter.
        while True:
            try:
                token = mad_graph.pop()
            except IndexError:
                # done
                break
            logger.info('Handling {}'.format(token['@id']))
            results.append(
                build_record_of_correlation_metrics_from_madqc_obj(
                    token, file_to_experiment_lookup))

    threads = []
    for _ in range(nthreads):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)
    for process in threads:
        process.join()
    after = time.time()
    logger.info('Built report. It took {}'.format(after - before))
    # failed records come back as None; drop them before building the frame
    results = [result for result in results if result is not None]
    result_df = pd.DataFrame(results)
    result_df.to_csv('mad_final.tsv', sep='\t')