dp_utils.py

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression


def is_way_smaller(x1, x2, epsilon=1e-12):
    # Strictly smaller by more than epsilon, so that ties caused by
    # floating-point noise do not trigger an extra cut.
    return (x1 + epsilon) < x2
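
# E.g. is_way_smaller(1.0, 1.0 + 1e-13) is False: a difference within
# epsilon counts as a tie.
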
def get_se(the_feat, the_y, the_sw, mimic_fn=LinearRegression):
    '''
    Fit mimic_fn to (the_feat, the_y) with optional sample weights the_sw,
    and return the weighted squared error together with the fitted model.
    '''
    if the_sw is not None and the_sw.sum() == 0.:
        # Don't choose this path during dynamic programming
        return np.inf, None
    if the_feat.ndim == 1:
        the_feat = the_feat.reshape(-1, 1)
    reg = mimic_fn().fit(the_feat, the_y, sample_weight=the_sw)
    y_pred = reg.predict(the_feat)
    if the_sw is None:
        the_sw = 1.
    se = (((the_y - y_pred) ** 2) * the_sw).sum()
    return se, reg
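
# Illustrative sanity check (synthetic data, not part of the module's API):
# a perfectly linear relation should give a squared error of ~0.
#   se, reg = get_se(np.arange(5, dtype=float), 2. * np.arange(5), None)
#   assert se < 1e-10
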
def run_dynamic_programming(feat, bin_feat, target, w, k=None, mimic_fn_type='linear'):
    '''
    feat: all the feature values of x
    bin_feat: the binarized feature values of x (in EBM, at most 256 bins)
    target: the corresponding values of y
    w: sample weights with the same length (L) as feat and target, or None
    k: maximum number of rows (cut counts) to consider; defaults to L + 1
    mimic_fn_type: 'linear' fits a line per segment, otherwise a constant
    '''
    L = len(bin_feat)
    if k is None:
        k = len(bin_feat) + 1
    mimic_fn = LinearRegression if mimic_fn_type == 'linear' else DummyRegressor
    ## dynamic programming algorithm
    score_mat = -np.ones((k, L))
    back_mat = -np.ones((k, L), dtype=int)
    pred_mat = np.empty((k, L), dtype=object)
    # Pre-calculate all the possible fitting errors: L * (L - 1) / 2 fits in total.
    # Only the upper triangle (i < j) is used.
    pre_cached_error = {}
    for i in range(L):
        pre_cached_error[i] = {}
        for j in range(i + 1, L):
            val_s, val_e = bin_feat[i], bin_feat[j]
            criteria = ((val_s < feat) & (feat <= val_e))
            the_feat = feat[criteria]
            the_target = target[criteria[:, 0]]
            the_sw = None if w is None else w[criteria[:, 0]]
            the_se, _ = get_se(the_feat, the_target, the_sw, mimic_fn)
            pre_cached_error[i][j] = the_se
    # Initialize the k=0 row (no cuts): fit one model per prefix of bins
    for i in range(L):
        criteria = (feat <= bin_feat[i])
        the_feat = feat[criteria]
        the_target = target[criteria[:, 0]]
        the_sw = None if w is None else w[criteria[:, 0]]
        score_mat[0, i], model = get_se(the_feat, the_target, the_sw, mimic_fn)
        pred_mat[0, i] = None if model is None else model.predict(np.array(bin_feat[:(i + 1)]).reshape(-1, 1))
    # Initialize the n=0 column: with a single bin, the error is the same for any k
    score_mat[:, 0], back_mat[:, 0] = score_mat[0, 0], 0
    for j in range(k):
        pred_mat[j, 0] = pred_mat[0, 0]
    # DP recurrence: score[k][n] = min over m in [k-1, n) of
    #   score[k-1][m] + SE(segment covering bins m+1 .. n)
    for idx_k in range(1, k):
        for idx_n in range(idx_k, L):
            # Initialize as "no cut": inherit the score with one fewer cut
            the_min_val, the_min_idx, the_min_pred = score_mat[idx_k - 1, idx_n], idx_n, None
            for prev_row_idx in range(idx_k - 1, idx_n):
                the_se = pre_cached_error[prev_row_idx][idx_n]
                the_se += score_mat[idx_k - 1, prev_row_idx]
                if is_way_smaller(the_se, the_min_val):
                    the_min_val, the_min_idx = the_se, prev_row_idx
            score_mat[idx_k, idx_n] = the_min_val
            back_mat[idx_k, idx_n] = the_min_idx
            if the_min_idx != idx_n:
                # Refit on the winning segment to recover its prediction: the
                # pre-cached table stores only errors, not fitted models.
                val_s, val_e = bin_feat[the_min_idx], bin_feat[idx_n]
                criteria = ((val_s < feat) & (feat <= val_e))
                the_sw = None if w is None else w[criteria[:, 0]]
                _, model = get_se(feat[criteria], target[criteria[:, 0]], the_sw, mimic_fn)
                the_min_pred = None if model is None else model.predict(
                    np.array(bin_feat[the_min_idx + 1:idx_n + 1]).reshape(-1, 1))
            pred_mat[idx_k, idx_n] = the_min_pred
        # Stop when there is no more benefit to having more cuts
        if back_mat[idx_k, L - 1] == L - 1:
            break
    return score_mat[:idx_k], back_mat[:idx_k], pred_mat[:idx_k]
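
# Returned shapes, for reference (K = number of rows kept after early stopping):
#   score_mat: (K, L) float; score_mat[k, n] is the best SE using k cuts on bins 0..n
#   back_mat:  (K, L) int; back_mat[k, n] is where the last segment starts (exclusive)
#   pred_mat:  (K, L) object; last-segment predictions, or None when no cut was taken
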
def backtrace(score_mat, back_mat, pred_mat, num_cut_point):
    '''
    Return the segment-boundary indices, the squared error, and the stitched
    per-bin predictions for the solution with num_cut_point cuts.
    '''
    assert num_cut_point >= 0 and num_cut_point < back_mat.shape[0]
    cut_idxes = []
    idx_k, idx_n = num_cut_point, back_mat.shape[1] - 1
    all_preds = []
    while idx_k > 0:
        the_pred = pred_mat[idx_k, idx_n]
        if the_pred is not None:
            all_preds.append(the_pred)
        idx_n = back_mat[idx_k, idx_n]
        assert idx_n != -1, str(idx_n)
        cut_idxes.append(idx_n)
        idx_k = idx_k - 1
    the_pred = pred_mat[idx_k, idx_n]
    if the_pred is not None:
        all_preds.append(the_pred)
    se = score_mat[num_cut_point, -1]
    return cut_idxes[::-1], se, np.hstack(all_preds[::-1])
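
# E.g. with one cut over ten bins, backtrace(score_mat, back_mat, pred_mat, 1)
# might return ([4], 0.0, <array of 10 per-bin predictions>): one cut after bin 4.
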
def simplify_curve(x, target_y, sample_weight=None, mimic_fn_type='linear'):
    '''
    Simplify the target curve into piecewise segments.
    Args:
        x: the binned feature values, a numpy array of shape (P, 1). E.g. [[18], [19], [20]]
        target_y: the log-odds risk at each x, a numpy array of shape (P,). E.g. [0.1, 0.5, 1.0]
        sample_weight: the sample weight for each x value. None means no sample weighting.
        mimic_fn_type: only supports 'const' or 'linear'.
    '''
    assert isinstance(x, np.ndarray) and x.ndim == 2, 'x is not in the right format'
    assert len(x) == len(target_y), 'x and y should have the same length'
    assert mimic_fn_type in ['const', 'linear'], 'mimic fn type should be linear or const, but got %s' % mimic_fn_type
    score_mat, back_mat, pred_mat = run_dynamic_programming(x, x, target_y, sample_weight, k=None, mimic_fn_type=mimic_fn_type)
    results = []
    K = len(score_mat)
    for k in range(0, K):
        cut_points, se, simplified = backtrace(score_mat, back_mat, pred_mat, k)
        results.append({'k': k, 'se': se, 'cut_points': cut_points, 'simplified': simplified})
    return pd.DataFrame(results)
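
# A minimal usage sketch. The step-shaped data below is synthetic and only
# illustrative; it is not part of the original module.
if __name__ == '__main__':
    x = np.arange(10, dtype=float).reshape(-1, 1)
    # A curve with a single jump after x = 4
    target_y = np.where(x[:, 0] < 5, 0.0, 1.0)
    df = simplify_curve(x, target_y, mimic_fn_type='const')
    # One row per number of cuts k: total squared error ('se'), chosen
    # boundaries ('cut_points'), and per-bin simplified predictions.
    print(df[['k', 'se', 'cut_points']])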