-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathgood_dense_data.py
371 lines (284 loc) · 12.9 KB
/
good_dense_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
""" DisMod III Simulation Study - Good, Dense Data
Example Usage
-------------
To generate 20 replicates of the simulation data::
>>> import good_dense_data, fit_all, dismod3
>>> [good_dense_data.generate_disease_data('test_disease') for ii in range(20)]
To run the data, figure out the ids for the models just created, then do this::
>>> dismod3.add_covariates_to_disease_model(id)
>>> fit_all.fit_all(id)
To see how it worked::
>>> good_dense_data.measure_fit(id, 'test_disease')
"""
import sys
import os
import optparse
import csv
import pylab as pl
import numpy as np
import pymc as mc
import simplejson as json
import dismod3
import dismod3.utils
from dismod3.disease_json import DiseaseJson
import dismod3.gbd_disease_model as model
from dismod3 import NEARLY_ZERO
from dismod3.neg_binom_model import countries_for, population_by_age, regional_population
import random
GBD_PATH = os.getcwd() + '/../'
sys.path.append(GBD_PATH)
OUTPUT_PATH = GBD_PATH
def generate_disease_data(condition, cov):
""" Generate csv files with gold-standard disease data,
and somewhat good, somewhat dense disease data, as might be expected from a
condition that is carefully studied in the literature
"""
age_len = dismod3.MAX_AGE
ages = np.arange(age_len, dtype='float')
# incidence rate
i0 = .005 + .02 * mc.invlogit((ages - 44) / 3)
#i0 = np.maximum(0., .001 * (-.125 + np.ones_like(ages) + (ages / age_len)**2.))
# remission rate
#r = 0. * ages
r = .1 * np.ones_like(ages)
# excess-mortality rate
#f_init = .085 * (ages / 100) ** 2.5
SMR = 3. * np.ones_like(ages) - ages / age_len
# all-cause mortality-rate
mort = dismod3.get_disease_model('all-cause_mortality')
#age_intervals = [[a, a+9] for a in range(0, dismod3.MAX_AGE-4, 10)] + [[0, 100] for ii in range(1)]
age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)]
# TODO: take age structure from real data
sparse_intervals = dict([[region, random.sample(age_intervals, (ii**3 * len(age_intervals)) / len(countries_for)**3 / 1)] for ii, region in enumerate(countries_for)])
dense_intervals = dict([[region, random.sample(age_intervals, len(age_intervals)/2)] for ii, region in enumerate(countries_for)])
gold_data = []
noisy_data = []
for ii, region in enumerate(sorted(countries_for)):
if region == 'world':
continue
print region
sys.stdout.flush()
# introduce unexplained regional variation
#i = i0 * (1 + float(ii) / 21)
# or not
i = i0
for year in [1990, 2005]:
for sex in ['male', 'female']:
param_type = 'all-cause_mortality'
key = dismod3.gbd_key_for(param_type, region, year, sex)
m_all_cause = mort.mortality(key, mort.data)
# calculate excess-mortality rate from smr
f = (SMR - 1.) * m_all_cause
## compartmental model (bins S, C, D, M)
import scipy.linalg
from dismod3 import NEARLY_ZERO
from dismod3.utils import trim
SCDM = np.zeros([4, age_len])
p = np.zeros(age_len)
m = np.zeros(age_len)
SCDM[0,0] = 1.
SCDM[1,0] = 0.
SCDM[2,0] = 0.
SCDM[3,0] = 0.
p[0] = SCDM[1,0] / (SCDM[0,0] + SCDM[1,0] + NEARLY_ZERO)
m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1-NEARLY_ZERO)
for a in range(age_len - 1):
A = [[-i[a]-m[a], r[a] , 0., 0.],
[ i[a] , -r[a]-m[a]-f[a], 0., 0.],
[ m[a], m[a] , 0., 0.],
[ 0., f[a], 0., 0.]]
SCDM[:,a+1] = np.dot(scipy.linalg.expm(A), SCDM[:,a])
p[a+1] = SCDM[1,a+1] / (SCDM[0,a+1] + SCDM[1,a+1] + NEARLY_ZERO)
m[a+1] = m_all_cause[a+1] - f[a+1] * p[a+1]
# duration = E[time in bin C]
hazard = r + m + f
pr_not_exit = np.exp(-hazard)
X = np.empty(len(hazard))
X[-1] = 1 / hazard[-1]
for ii in reversed(range(len(X)-1)):
X[ii] = (pr_not_exit[ii] * (X[ii+1] + 1)) + (1 / hazard[ii] * (1 - pr_not_exit[ii]) - pr_not_exit[ii])
country = countries_for[region][0]
params = dict(age_intervals=age_intervals, condition=condition, gbd_region=region,
country=country, year=year, sex=sex, effective_sample_size=1000)
params['age_intervals'] = [[0,99]]
generate_and_append_data(gold_data, 'prevalence data', p, **params)
generate_and_append_data(gold_data, 'incidence data', i, **params)
generate_and_append_data(gold_data, 'excess-mortality data', f, **params)
generate_and_append_data(gold_data, 'remission data', r, **params)
generate_and_append_data(gold_data, 'duration data', X, **params)
# TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum
params['age_intervals'] = [[0,99]]
iX = i * X * (1-p) * regional_population(key)
generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params)
params['effective_sample_size'] = 1000
params['cov'] = 0.
params['age_intervals'] = age_intervals
generate_and_append_data(noisy_data, 'prevalence data', p, **params)
generate_and_append_data(noisy_data, 'excess-mortality data', f, **params)
generate_and_append_data(noisy_data, 'remission data', r, **params)
generate_and_append_data(noisy_data, 'incidence data', i, **params)
col_names = sorted(data_dict_for_csv(gold_data[0]).keys())
f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w')
csv_f = csv.writer(f_file, dialect='excel-tab')
csv_f.writerow(col_names)
for d in gold_data:
dd = data_dict_for_csv(d)
csv_f.writerow([dd[c] for c in col_names])
f_file.close()
f_name = OUTPUT_PATH + '%s_data.tsv' % condition
f_file = open(f_name, 'w')
csv_f = csv.writer(f_file, dialect='excel-tab')
csv_f.writerow(col_names)
for d in noisy_data:
dd = data_dict_for_csv(d)
csv_f.writerow([dd[c] for c in col_names])
f_file.close()
# upload data file
from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL
dismod_server_login()
twc.go(DISMOD_BASE_URL + 'dismod/data/upload/')
twc.formvalue(1, 'tab_separated_values', open(f_name).read())
# TODO: find or set the model number for this model, set the
# expert priors and covariates, merge the covariate data into the
# model, and add the "ground truth" to the disease json
try:
url = twc.submit()
except Exception, e:
print e
def generate_and_append_data(data, data_type, truth, age_intervals, condition,
gbd_region, country, year, sex, effective_sample_size, cov=0.):
""" create simulated data"""
for a0, a1 in age_intervals:
d = { 'condition': condition,
'data_type': data_type,
'gbd_region': gbd_region,
'region': country,
'year_start': year,
'year_end': year,
'sex': sex,
'age_start': a0,
'age_end': a1,
'id': len(data),}
holdout = 0
d['ignore'] = holdout
d['test_set'] = holdout
ages = range(a0, a1 + 1)
if data_type == 'incidence_x_duration':
pop = 1. * np.ones_like(ages)
else:
pop = np.array([population_by_age[(country, str(year), sex)][a] for a in ages])
if np.sum(pop) > 0:
pop /= float(np.sum(pop)) # normalize the pop weights to sum to 1
else:
pop = np.ones_like(ages) / float(len(ages)) # for countries where pop is zero, fill in constant structure
d['age_weights'] = list(pop)
p0 = dismod3.utils.rate_for_range(truth, ages, pop)
d['truth'] = p0
if p0 == 0 or cov == 0:
p1 = p0
d['value'] = p1
d['effective_sample_size'] = effective_sample_size
else:
p1 = mc.rtruncnorm(p0, (cov/100. * p0)**-2, 0, np.inf)
assert not np.isnan(p1)
d['value'] = p1
d['standard_error'] = p0 * cov/100.
data.append(d)
def data_dict_for_csv(d):
c = {
'GBD Cause': d['condition'],
'Parameter': d['data_type'].replace('-', ' '),
'Country ISO3 Code': d['region'],
'Region': d['gbd_region'],
'Parameter Value': d['value'],
'Standard Error': d.get('standard_error', ''),
'Effective Sample Size': d.get('effective_sample_size', ''),
'Units': 1.0,
'Sex': d['sex'],
'Age Start': d['age_start'],
'Age End': d['age_end'],
'Year Start': d['year_start'],
'Year End': d['year_end'],
'Ignore': d['ignore'],
'Test Set': d['test_set'],
'Truth': d['truth'],
}
return c
def predict(type, dm, d):
for k in d.keys():
d[dismod3.utils.clean(k)] = d[k]
t = d['parameter'].replace(' data', '').replace(' ', '-')
r = d['region']
y = int(d['year_start'])
s = d['sex']
key = dismod3.gbd_key_for(t, r, y, s)
a0 = int(d['age_start'])
a1 = int(d['age_end'])
est_by_age = dm.get_mcmc(type, key)
if len(est_by_age) == 0:
return -99
ages = range(a0, a1 + 1)
#pop = np.ones(a1 + 1 - a0) / float(a1 + 1 - a0))
c = d['country_iso3_code']
if t == 'incidence_x_duration':
pop = 1. * np.ones_like(ages)
else:
pop = [population_by_age[(c, str(y), s)][a] for a in ages]
pop /= np.sum(pop) # normalize the pop weights to sum to 1
est = dismod3.utils.rate_for_range(est_by_age, ages, pop)
d['estimate %s' % type] = est
return est
def measure_fit(id, condition):
"""
Determine the RMSE, MAE, and Coverage of the fit stored in model specified by id
"""
print 'downloading model %d' % id
sys.stdout.flush()
dm = dismod3.load_disease_model(id)
#print 'loading gold-standard data'
gold_data = [d for d in csv.DictReader(open(OUTPUT_PATH + '%s_gold.tsv' % condition), dialect='excel-tab')]
#print 'comparing values'
abs_err = dict(incidence=[], prevalence=[], remission=[], duration=[], incidence_x_duration=[])
rel_err = dict(incidence=[], prevalence=[], remission=[], duration=[], incidence_x_duration=[])
coverage = dict(incidence=[], prevalence=[], remission=[], duration=[], incidence_x_duration=[])
for metric in [abs_err, rel_err, coverage]:
metric['excess mortality'] = []
for d in gold_data:
est = predict('mean', dm, d)
lb = predict('lower_ui', dm, d)
ub = predict('upper_ui', dm, d)
if est < 0:
continue
val = float(d['Truth'])
err = val - est
#if d['Age Start'] <= 50:
# continue
t = d['Parameter'].replace(' data', '')
abs_err[t].append(err)
if val > 0.:
rel_err[t].append(100 * err / val)
coverage[t].append(val >= lb and val <= ub)
for k in abs_err:
print '%s abs RMSE = %f' % (k, np.sqrt(np.mean(np.array(abs_err[k])**2)))
print '%s abs MAE = %f' % (k, np.median(np.abs(abs_err[k])))
print
for k in rel_err:
print '%s rel pct RMSE = %f' % (k, np.sqrt(np.mean(np.array(rel_err[k])**2)))
print '%s rel pct MAE = %f' % (k, np.median(np.abs(rel_err[k])))
print
for k in coverage:
print '%s coverage = %f' % (k, np.sum(coverage[k]) * 100. / len(coverage[k]))
print
k = 'incidence_x_duration'
print '%s rel pct MAE =\t%f' % (k, np.median(np.abs(rel_err[k])))
return np.median(np.abs(rel_err[k]))
# add estimate value as a column in the gold data tsv, for looking
# in more detail with a spreadsheet or different code
col_names = sorted(set(gold_data[0].keys()) | set(['Estimate Value']))
f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w')
csv_f = csv.writer(f_file, dialect='excel-tab')
csv_f.writerow(col_names)
csv_f = csv.DictWriter(f_file, col_names, dialect='excel-tab')
for d in gold_data:
csv_f.writerow(d)
f_file.close()