import pandas as pd
import numpy as np
import subprocess
import sys
subprocess.call([sys.executable, '-m', 'pip', 'install', 'category_encoders'])  #replaces !pip install for use outside a notebook
import category_encoders as ce
import sklearn
sklearn.set_config(transform_output="pandas")  #pass pandas tables through pipeline instead of numpy matrices
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_halving_search_cv  #noqa: side-effect import that enables HalvingGridSearchCV
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
#variance-based random states for splitting (see find_random_state below)
titanic_variance_based_split = 107
customer_variance_based_split = 113
stroke_variance_based_split = 33
#This class maps the values in a single column through a user-supplied dictionary.
class CustomMappingTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, mapping_column, mapping_dict: dict):
        assert isinstance(mapping_dict, dict), f'{self.__class__.__name__} constructor expected dictionary but got {type(mapping_dict)} instead.'
        self.mapping_dict = mapping_dict
        self.mapping_column = mapping_column

    def fit(self, X, y=None):
        print(f"\nWarning: {self.__class__.__name__}.fit does nothing.\n")
        return self

    def transform(self, X):
        assert isinstance(X, pd.core.frame.DataFrame), f'{self.__class__.__name__}.transform expected Dataframe but got {type(X)} instead.'
        assert self.mapping_column in X.columns.to_list(), f'{self.__class__.__name__}.transform unknown column "{self.mapping_column}"'  #column legit?

        #Set up for producing warnings. First have to rework nan values to allow set operations to work.
        #In particular, the conversion of a column to a Series, e.g., X[self.mapping_column], transforms nan values in strange ways that screw up set differencing.
        #Strategy is to convert empty values to a string, then the string back to np.nan.
        placeholder = "NaN"
        column_values = X[self.mapping_column].fillna(placeholder).tolist()  #convert all nan values to the string "NaN" in new list
        column_values = [np.nan if v == placeholder else v for v in column_values]  #now convert back to np.nan
        keys_values = self.mapping_dict.keys()

        column_set = set(column_values)  #without the conversion above, the set will fail to have np.nan values where they should be
        keys_set = set(keys_values)  #this will have np.nan values where they should be, so no conversion necessary

        #verify all mapping keys appear in the column
        keys_not_found = keys_set - column_set
        if keys_not_found:
            print(f"\nWarning: {self.__class__.__name__}[{self.mapping_column}] these mapping keys do not appear in the column: {keys_not_found}\n")

        #verify whether the column holds values that have no mapping key
        keys_absent = column_set - keys_set
        if keys_absent:
            print(f"\nWarning: {self.__class__.__name__}[{self.mapping_column}] these values in the column do not have corresponding mapping keys: {keys_absent}\n")

        #actual mapping; assign the replaced column back rather than using inplace=True on a column slice, which raises chained-assignment warnings in newer pandas
        X_ = X.copy()
        X_[self.mapping_column] = X_[self.mapping_column].replace(self.mapping_dict)
        return X_

    def fit_transform(self, X, y=None):
        #self.fit(X, y)  #fit does nothing, so skip it and avoid the warning
        result = self.transform(X)
        return result
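
#Illustrative usage sketch (the toy DataFrame below is an assumption, not part of the library):
#  df = pd.DataFrame({'Gender': ['Male', 'Female', np.nan]})
#  mapped = CustomMappingTransformer('Gender', {'Male': 0, 'Female': 1}).fit_transform(df)
#  #mapped['Gender'] is now 0, 1, nan; unmapped values like the nan trigger the warnings above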
#This class will rename one or more columns.
class CustomRenamingTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, rename_dict: dict):
        assert isinstance(rename_dict, dict), f'{self.__class__.__name__} constructor expected dictionary for rename_dict but got {type(rename_dict)} instead.'
        self.rename_dict = rename_dict

    def fit(self, X, y=None):
        print(f"\nWarning: {self.__class__.__name__}.fit does nothing.\n")
        return self

    def transform(self, X):
        assert isinstance(X, pd.core.frame.DataFrame), f'{self.__class__.__name__}.transform expected Dataframe but got {type(X)} instead.'
        missing_columns = [key for key in self.rename_dict.keys() if key not in X.columns.to_list()]
        if missing_columns:
            raise AssertionError(f'{self.__class__.__name__}: cannot rename unknown columns: {missing_columns}')
        X_ = X.rename(columns=self.rename_dict)  #rename returns a new DataFrame, so no explicit copy needed
        return X_

    def fit_transform(self, X, y=None):
        #self.fit(X, y)  #fit does nothing, so skip it and avoid the warning
        result = self.transform(X)
        return result
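
#Illustrative usage sketch (toy DataFrame is an assumption): renaming with an {old: new} dict.
#  df = pd.DataFrame({'Time Spent': [1.0], 'OS': ['iOS']})
#  renamed = CustomRenamingTransformer({'Time Spent': 'TimeSpent'}).fit_transform(df)
#  #renamed.columns is ['TimeSpent', 'OS']; unknown keys raise an AssertionError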
#This class will perform a One Hot Encoding on a single column.
class CustomOHETransformer(BaseEstimator, TransformerMixin):

    def __init__(self, target_column, dummy_na=False, drop_first=False):
        self.target_column = target_column
        self.dummy_na = dummy_na
        self.drop_first = drop_first

    def fit(self, X, y=None):
        print(f"\nWarning: {self.__class__.__name__}.fit does nothing.\n")
        return self

    def transform(self, X):
        assert isinstance(X, pd.core.frame.DataFrame), f'{self.__class__.__name__}.transform expected Dataframe but got {type(X)} instead.'
        assert self.target_column in X.columns.to_list(), f'{self.__class__.__name__}.transform unknown column "{self.target_column}"'
        X_ = X.copy()
        X_return = pd.get_dummies(X_,
                                  prefix=self.target_column,
                                  prefix_sep='_',
                                  columns=[self.target_column],
                                  dummy_na=self.dummy_na,
                                  drop_first=self.drop_first
                                  )
        return X_return

    def fit_transform(self, X, y=None):
        #self.fit(X, y)  #fit does nothing, so skip it and avoid the warning
        result = self.transform(X)
        return result
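
#Illustrative usage sketch (toy DataFrame is an assumption): one-hot encoding a categorical column.
#  df = pd.DataFrame({'Joined': ['Cherbourg', 'Queenstown', 'Southampton']})
#  ohe = CustomOHETransformer('Joined').fit_transform(df)
#  #produces indicator columns Joined_Cherbourg, Joined_Queenstown, Joined_Southampton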
#This class clips a numeric column to the 3-sigma interval learned during fit.
class CustomSigma3Transformer(BaseEstimator, TransformerMixin):

    def __init__(self, target_column):
        self.target_column = target_column
        self.three_sigma = None

    def fit(self, X, y=None):
        assert isinstance(X, pd.core.frame.DataFrame), f'expected Dataframe but got {type(X)} instead.'
        assert self.target_column in X.columns, f'unknown column {self.target_column}'
        assert all([isinstance(v, (int, float)) for v in X[self.target_column].to_list()]), f'{self.target_column} must contain only numeric values'
        mean = X[self.target_column].mean()
        std_dev = X[self.target_column].std()
        self.three_sigma = (mean - 3 * std_dev, mean + 3 * std_dev)
        return self  #fit must return self so this transformer works inside a Pipeline

    def transform(self, X):
        assert self.three_sigma is not None, f'NotFittedError: This {self.__class__.__name__} instance is not fitted yet. Call "fit" with appropriate arguments before using this estimator.'
        transformed_df = X.copy()
        transformed_df[self.target_column] = transformed_df[self.target_column].clip(lower=self.three_sigma[0], upper=self.three_sigma[1])
        df_clean = transformed_df.reset_index(drop=True)
        return df_clean

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        result = self.transform(X)
        return result
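
#Illustrative usage sketch: with column mean mu and standard deviation sigma learned in fit,
#transform clips values to the interval [mu - 3*sigma, mu + 3*sigma].
#  clipper = CustomSigma3Transformer('Fare')  #'Fare' and df are assumptions for demonstration
#  clipped = clipper.fit_transform(df)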
#This class clips a numeric column at the Tukey fences (inner = 1.5*iqr, outer = 3*iqr) learned during fit.
class CustomTukeyTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, target_column, fence='outer'):
        assert fence in ['inner', 'outer'], f'fence must be "inner" or "outer" but got {fence} instead.'
        self.target_column = target_column
        self.fence = fence
        self.boundaries = None

    def fit(self, X, y=None):
        assert isinstance(X, pd.core.frame.DataFrame), f'expected Dataframe but got {type(X)} instead.'
        assert self.target_column in X.columns, f'unknown column {self.target_column}'
        assert all([isinstance(v, (int, float)) for v in X[self.target_column].to_list()]), f'{self.target_column} must contain only numeric values'
        q1 = X[self.target_column].quantile(0.25)
        q3 = X[self.target_column].quantile(0.75)
        iqr = q3 - q1
        fence_values = {'outer': 3, 'inner': 1.5}
        fence_multiplier = fence_values.get(self.fence)
        low = q1 - fence_multiplier * iqr
        high = q3 + fence_multiplier * iqr
        self.boundaries = (low, high)
        return self

    def transform(self, X):
        assert self.boundaries is not None, f'NotFittedError: This {self.__class__.__name__} instance is not fitted yet. Call "fit" with appropriate arguments before using this estimator.'
        transformed_df = X.copy()
        transformed_df[self.target_column] = transformed_df[self.target_column].clip(lower=self.boundaries[0], upper=self.boundaries[1])
        df_clean = transformed_df.reset_index(drop=True)
        return df_clean

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        result = self.transform(X)
        return result
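
#Illustrative usage sketch (toy DataFrame is an assumption). With iqr = q3 - q1, the inner
#fences sit at q1 - 1.5*iqr and q3 + 1.5*iqr, the outer fences at q1 - 3*iqr and q3 + 3*iqr,
#so fence='inner' clips more aggressively than fence='outer'.
#  df = pd.DataFrame({'Age': [22, 25, 28, 30, 95]})  #q1=25, q3=30, iqr=5
#  clipped = CustomTukeyTransformer(target_column='Age', fence='inner').fit_transform(df)
#  #the 95 is clipped to the upper inner fence: 30 + 1.5*5 = 37.5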
#This class performs robust scaling, (x - median) / iqr, on a numeric column.
class CustomRobustTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, column):
        self.target_column = column
        self.iqr = None
        self.median = None

    def fit(self, X, y=None):
        assert isinstance(X, pd.core.frame.DataFrame), f'expected Dataframe but got {type(X)} instead.'
        assert self.target_column in X.columns, f'unknown column {self.target_column}'
        assert all([isinstance(v, (int, float)) for v in X[self.target_column].to_list()]), f'{self.target_column} must contain only numeric values'
        self.iqr = X[self.target_column].quantile(.75) - X[self.target_column].quantile(.25)
        self.median = X[self.target_column].median()
        return self

    def transform(self, X):
        assert self.iqr is not None and self.median is not None, f'NotFittedError: This {self.__class__.__name__} instance is not fitted yet. Call "fit" with appropriate arguments before using this estimator.'
        transformed_df = X.copy()
        transformed_df[self.target_column] -= self.median
        transformed_df[self.target_column] /= self.iqr  #note: assumes a nonzero iqr; a constant column would divide by zero here
        return transformed_df

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        result = self.transform(X)
        return result
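
#Illustrative usage sketch (toy DataFrame is an assumption). Robust scaling resists outliers
#better than z-scoring because the median and iqr are insensitive to extreme values.
#  df = pd.DataFrame({'Fare': [10., 20., 30., 40., 1000.]})  #median=30, iqr=20
#  scaled = CustomRobustTransformer('Fare').fit_transform(df)
#  #the typical values map to -1, -.5, 0, .5; the outlier maps to 48.5 without distorting the others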
def find_random_state(features_df, labels, n=200):
    model = KNeighborsClassifier(n_neighbors=5)  #instantiate with k=5
    var = []  #collect test_error/train_error where error is based on F1 score
    for i in range(1, n):
        train_X, test_X, train_y, test_y = train_test_split(features_df, labels, test_size=0.2, shuffle=True,
                                                            random_state=i, stratify=labels)
        model.fit(train_X, train_y)  #train model
        train_pred = model.predict(train_X)  #predict against training set
        test_pred = model.predict(test_X)  #predict against test set
        train_f1 = f1_score(train_y, train_pred)  #F1 on training predictions
        test_f1 = f1_score(test_y, test_pred)  #F1 on test predictions
        if test_f1 * train_f1 == 0:  #skip the degenerate case where either F1 is 0
            continue
        f1_ratio = test_f1 / train_f1  #take the ratio
        var.append(f1_ratio)
    rs_value = sum(var) / len(var)  #get average ratio value
    idx = int(np.abs(np.array(var) - rs_value).argmin())  #index of the ratio closest to the average; var must be a numpy array for the subtraction to broadcast
    return idx  #caution: idx indexes var, whose entries start at random_state 1 and may skip values, so it approximates the best random_state
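
#Illustrative usage sketch (titanic_table below is an assumed DataFrame loaded elsewhere):
#  features_df = titanic_table.drop(columns='Survived')
#  labels = titanic_table['Survived'].to_list()
#  rs = find_random_state(features_df, labels)  #the module constants above (e.g., 107) were presumably derived this way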
titanic_transformer = Pipeline(steps=[
    ('map_gender', CustomMappingTransformer('Gender', {'Male': 0, 'Female': 1})),
    ('map_class', CustomMappingTransformer('Class', {'Crew': 0, 'C3': 1, 'C2': 2, 'C1': 3})),
    ('target_joined', ce.TargetEncoder(cols=['Joined'],
                                       handle_missing='return_nan',  #will use imputer later to fill in
                                       handle_unknown='return_nan'  #will use imputer later to fill in
                                       )),
    ('tukey_age', CustomTukeyTransformer(target_column='Age', fence='outer')),
    ('tukey_fare', CustomTukeyTransformer(target_column='Fare', fence='outer')),
    ('scale_age', CustomRobustTransformer('Age')),
    ('scale_fare', CustomRobustTransformer('Fare')),
    ('imputer', KNNImputer(n_neighbors=5, weights="uniform", add_indicator=False))
    ], verbose=True)
customer_transformer = Pipeline(steps=[
    ('map_os', CustomMappingTransformer('OS', {'Android': 0, 'iOS': 1})),
    ('target_isp', ce.TargetEncoder(cols=['ISP'],
                                    handle_missing='return_nan',  #will use imputer later to fill in
                                    handle_unknown='return_nan'  #will use imputer later to fill in
                                    )),
    ('map_level', CustomMappingTransformer('Experience Level', {'low': 0, 'medium': 1, 'high': 2})),
    ('map_gender', CustomMappingTransformer('Gender', {'Male': 0, 'Female': 1})),
    ('tukey_age', CustomTukeyTransformer('Age', 'inner')),
    ('tukey_time spent', CustomTukeyTransformer('Time Spent', 'inner')),
    ('scale_age', CustomRobustTransformer('Age')),
    ('scale_time spent', CustomRobustTransformer('Time Spent')),
    ('impute', KNNImputer(n_neighbors=5, weights="uniform", add_indicator=False)),
    ], verbose=True)
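
#Illustrative sketch: the pipelines can also be applied directly. Because sklearn is
#configured with transform_output="pandas" above, fit_transform returns a DataFrame.
#  transformed_df = titanic_transformer.fit_transform(X_train, y_train)  #X_train, y_train assumed to exist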
def dataset_setup(original_table, label_column_name: str, the_transformer, rs, ts=.2):
    #extract features and labels from the original table
    features = original_table.drop(columns=label_column_name)
    labels = original_table[label_column_name].to_list()

    #split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=ts, shuffle=True,
                                                        random_state=rs, stratify=labels)

    #prepare the training and test data; fit only on the training set to avoid leaking test information
    X_train_transformed_numpy = the_transformer.fit_transform(X_train, y_train).to_numpy()
    X_test_transformed_numpy = the_transformer.transform(X_test).to_numpy()
    y_train_numpy = np.array(y_train)
    y_test_numpy = np.array(y_test)

    return X_train_transformed_numpy, X_test_transformed_numpy, y_train_numpy, y_test_numpy
def titanic_setup(titanic_table, transformer=titanic_transformer, rs=titanic_variance_based_split, ts=.2):
    return dataset_setup(titanic_table, 'Survived', transformer, rs=rs, ts=ts)

def customer_setup(customer_table, transformer=customer_transformer, rs=customer_variance_based_split, ts=.2):
    return dataset_setup(customer_table, 'Rating', transformer, rs=rs, ts=ts)
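
#Illustrative usage sketch (titanic_table is an assumed DataFrame with a 'Survived' column):
#  x_train, x_test, y_train, y_test = titanic_setup(titanic_table)
#  #equivalent to dataset_setup(titanic_table, 'Survived', titanic_transformer, rs=titanic_variance_based_split)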
def threshold_results(thresh_list, actuals, predicted):
    result_df = pd.DataFrame(columns=['threshold', 'precision', 'recall', 'f1', 'accuracy'])
    for t in thresh_list:
        yhat = [1 if v >= t else 0 for v in predicted]
        #note: where TP=0, precision and recall both become 0; zero_division=0 returns 0 in that case instead of warning
        precision = precision_score(actuals, yhat, zero_division=0)
        recall = recall_score(actuals, yhat, zero_division=0)
        f1 = f1_score(actuals, yhat)
        accuracy = accuracy_score(actuals, yhat)
        result_df.loc[len(result_df)] = {'threshold': t, 'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': accuracy}
    result_df = result_df.round(2)

    #next bit fancies up the table for printing; see https://betterdatascience.com/style-pandas-dataframes/
    #note that fancy_df is not really a dataframe; it is more like a printable object
    headers = {
        "selector": "th:not(.index_name)",
        "props": "background-color: #800000; color: white; text-align: center"
        }
    properties = {"border": "1px solid black", "width": "65px", "text-align": "center"}
    fancy_df = result_df.style.format(precision=2).set_properties(**properties).set_table_styles([headers])
    return (result_df, fancy_df)
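
#Illustrative usage sketch (model, x_test, y_test are assumptions): sweep thresholds over predicted probabilities.
#  probs = model.predict_proba(x_test)[:, 1]  #probability of the positive class
#  result_df, fancy_df = threshold_results(np.arange(.1, 1, .1), y_test, probs)
#  fancy_df  #display the styled table in a notebook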
def halving_search(model, grid, x_train, y_train, factor=2, min_resources="exhaust", scoring='roc_auc'):
    halving_cv = HalvingGridSearchCV(
        model, grid,  #our model and the parameter combos we want to try
        scoring=scoring,
        n_jobs=-1,  #use all available cpus
        min_resources=min_resources,  #"exhaust" sets this to 20, which is non-optimal; possible bug in algorithm
        factor=factor,
        cv=5, random_state=1234,
        refit=True,  #remembers the best combo and gives us back that model, already trained and ready for testing
        )
    grid_result = halving_cv.fit(x_train, y_train)
    return grid_result
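
#Illustrative usage sketch (the grid and training arrays are assumptions, and logistic
#regression is just one model this could wrap):
#  from sklearn.linear_model import LogisticRegression
#  model = LogisticRegression(max_iter=1000)
#  grid = {'C': [0.1, 1.0, 10.0]}  #hypothetical parameter grid
#  best = halving_search(model, grid, x_train, y_train)
#  print(best.best_params_, best.best_score_)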