"""
This script provides custom functions and classes for Machine Learning and Data Science
projects. The code below makes the model training and evaluation process easier for developers so they
can focus on model improvement and code structure.
--- SUMMARY ---
1. Classification
1.1 Classifiers Analysis
2. Clustering
"""
# Importing libraries
import pandas as pd
import numpy as np
import itertools
import time
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from viz_utils import format_spines, AnnotateBars
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_val_predict, learning_curve
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
import shap
from sklearn.cluster import KMeans
"""
-----------------------------------
-------- 1. CLASSIFICATION --------
1.1 Classifiers Analysis
-----------------------------------
"""
class BinaryClassifiersAnalysis:
"""
This class makes training binary classifiers easier by providing useful functions for fitting models,
searching the hyperparameter space, evaluating metrics and much more
"""
def __init__(self):
self.classifiers_info = {}
def fit(self, classifiers, X, y, approach='', random_search=False, scoring='roc_auc', cv=5, verbose=5, n_jobs=-1):
"""
This function receives information from classifiers to be trained, the data to be used on training and other
parameters for fitting the model to the data.
Parameters
----------
:param classifiers: dictionary containing estimators and hyperparameters inner dict [type: dict]
:param X: object containing features already prepared for training the model [type: np.array]
:param y: object containing the model target variable [type: np.array]
:param approach: string appended to the model's name as a suffix for identification purposes [type: string, default: '']
:param random_search: guides the application of RandomizedSearchCV on training [type: bool, default: False]
:param scoring: scoring metric to be optimized on random search [type: string, default: 'roc_auc']
:param cv: K-folds used on random search cross-validation [type: int, default: 5]
:param verbose: verbose param from RandomizedSearchCV [type: int, default: 5]
:param n_jobs: n_jobs param from RandomizedSearchCV [type: int, default: -1]
Return
------
This method doesn't return anything, but it fills class attributes such as the self.classifiers_info dict
Application
-----------
# Creating dictionary object for storing models information
set_classifiers = {
'ModelName': {
'model': ClassifierEstimator(),
'params': clf_dict_params
}
}
trainer = BinaryClassifiersAnalysis()
trainer.fit(set_classifiers, X_train_prep, y_train, random_search=True, cv=5)
"""
# Iterating through every model in the dictionary of classifiers
for model_name, model_info in classifiers.items():
clf_key = model_name + approach
print(f'Training model {clf_key}\n')
# Creating an empty dict for storing model information
self.classifiers_info[clf_key] = {}
# Application of RandomizedSearchCV
if random_search:
rnd_search = RandomizedSearchCV(model_info['model'], model_info['params'], scoring=scoring, cv=cv,
verbose=verbose, random_state=42, n_jobs=n_jobs)
rnd_search.fit(X, y)
# Saving the best estimator into the model's dict
self.classifiers_info[clf_key]['estimator'] = rnd_search.best_estimator_
else:
self.classifiers_info[clf_key]['estimator'] = model_info['model'].fit(X, y)
def compute_train_performance(self, model_name, estimator, X, y, cv=5):
"""
This function applies cross validation to retrieve useful metrics for the classification model.
In practice, this function would be called by another one (usually with compute_test_performance as well)
Parameters
----------
:param model_name: key-string that identifies a model at self.classifiers_info dict [type: string]
:param estimator: trained estimator to be evaluated with cross validation [type: model]
:param X: object containing features already prepared for training the model [type: np.array]
:param y: object containing the model target variable [type: np.array]
:param cv: k-folds for cross validation application on training evaluation
Return
------
:return train_performance: DataFrame containing model metrics calculated using cross validation
Application
-----------
# Evaluating training performance using cross-validation
df_performances = trainer.compute_train_performance('DecisionTrees', trained_model, X_train, y_train, cv=5)
"""
# Computing metrics using cross validation
t0 = time.time()
accuracy = cross_val_score(estimator, X, y, cv=cv, scoring='accuracy').mean()
precision = cross_val_score(estimator, X, y, cv=cv, scoring='precision').mean()
recall = cross_val_score(estimator, X, y, cv=cv, scoring='recall').mean()
f1 = cross_val_score(estimator, X, y, cv=cv, scoring='f1').mean()
# Probas for calculating AUC
try:
y_scores = cross_val_predict(estimator, X, y, cv=cv, method='decision_function')
except:
# Tree based models don't have 'decision_function()' method, but 'predict_proba()'
y_probas = cross_val_predict(estimator, X, y, cv=cv, method='predict_proba')
y_scores = y_probas[:, 1]
auc = roc_auc_score(y, y_scores)
# Saving scores on self.classifiers_info dictionary
self.classifiers_info[model_name]['train_scores'] = y_scores
# Creating a DataFrame with metrics
t1 = time.time()
delta_time = t1 - t0
train_performance = {}
train_performance['model'] = model_name
train_performance['approach'] = f'Train {cv} K-folds'
train_performance['acc'] = round(accuracy, 4)
train_performance['precision'] = round(precision, 4)
train_performance['recall'] = round(recall, 4)
train_performance['f1'] = round(f1, 4)
train_performance['auc'] = round(auc, 4)
train_performance['total_time'] = round(delta_time, 3)
return pd.DataFrame(train_performance, index=[0])
def compute_test_performance(self, model_name, estimator, X, y):
"""
This function retrieves metrics from the trained model on test data.
In practice, this function would be called by another one (usually with compute_train_performance as well)
Parameters
----------
:param model_name: key-string that identifies a model at self.classifiers_info dict [type: string]
:param estimator: trained estimator to be evaluated on test data [type: model]
:param X: object containing features already prepared for training the model [type: np.array]
:param y: object containing the model target variable [type: np.array]
Return
------
:return test_performance: DataFrame containing model metrics calculated on test data
Application
-----------
# Evaluating test data performance
df_performances = trainer.compute_test_performance('DecisionTrees', trained_model, X_test, y_test)
"""
# Predicting data using the trained model and computing probabilities
t0 = time.time()
y_pred = estimator.predict(X)
y_proba = estimator.predict_proba(X)
y_scores = y_proba[:, 1]
# Retrieving metrics using test data
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_scores)
# Saving probabilities on the trained classifiers dictionary
self.classifiers_info[model_name]['test_scores'] = y_scores
# Creating a DataFrame with metrics
t1 = time.time()
delta_time = t1 - t0
test_performance = {}
test_performance['model'] = model_name
test_performance['approach'] = 'Test'
test_performance['acc'] = round(accuracy, 4)
test_performance['precision'] = round(precision, 4)
test_performance['recall'] = round(recall, 4)
test_performance['f1'] = round(f1, 4)
test_performance['auc'] = round(auc, 4)
test_performance['total_time'] = round(delta_time, 3)
return pd.DataFrame(test_performance, index=[0])
def evaluate_performance(self, X_train, y_train, X_test, y_test, cv=5, save=False, overwrite=True,
performances_filepath='model_performances.csv'):
"""
This function centralizes the evaluating metric process by calling train and test evaluation functions.
Parameters
----------
:param X_train: training data to be used on evaluation [np.array]
:param y_train: training target variable to be used on evaluation [type: np.array]
:param X_test: testing data to be used on evaluation [np.array]
:param y_test: testing target variable to be used on evaluation [type: np.array]
:param cv: K-folds used on cross validation step [type: int, default: 5]
:param save: flag that guides saving the final DataFrame with metrics [type: bool, default: False]
:param overwrite: flag that guides the overwriting of a saved metrics file [type: bool, default: True]
:param performances_filepath: path reference for saving model performances dataset [type: string,
default: 'model_performances.csv']
Return
------
:return df_performance: DataFrame containing model metrics calculated on training and test data
Application
-----------
# Evaluating performance on training and testing
df_performance = trainer.evaluate_performance(X_train, y_train, X_test, y_test, save=True)
"""
# Iterating over each trained classifier at classifiers_info dictionary
df_performances = pd.DataFrame({})
for model_name, model_info in self.classifiers_info.items():
# Validating if the model was already trained (the key 'train_performance' will be at model_info dict if so)
if 'train_performance' in model_info.keys():
df_performances = pd.concat([df_performances, model_info['train_performance']])
df_performances = pd.concat([df_performances, model_info['test_performance']])
continue
# Returning the estimator for calling the evaluation functions
print(f'Evaluating model {model_name}\n')
estimator = model_info['estimator']
# Retrieving training and testing metrics by calling inner functions
train_performance = self.compute_train_performance(model_name, estimator, X_train, y_train, cv=cv)
test_performance = self.compute_test_performance(model_name, estimator, X_test, y_test)
# Putting results on model's dictionary (classifiers_info)
self.classifiers_info[model_name]['train_performance'] = train_performance
self.classifiers_info[model_name]['test_performance'] = test_performance
# Building a unique DataFrame with the performances retrieved
model_performance = pd.concat([train_performance, test_performance])
df_performances = pd.concat([df_performances, model_performance])
# Saving some attributes on model_info dictionary for further access
model_data = {
'X_train': X_train,
'y_train': y_train,
'X_test': X_test,
'y_test': y_test
}
model_info['model_data'] = model_data
# Saving the metrics file if applicable
if save:
# Adding information of measuring and execution time
cols_performance = list(df_performances.columns)
df_performances['anomesdia'] = datetime.now().strftime('%Y%m%d')
df_performances['anomesdia_datetime'] = datetime.now()
df_performances = df_performances.loc[:, ['anomesdia', 'anomesdia_datetime'] + cols_performance]
# Validating overwriting or append on data already saved
if overwrite:
df_performances.to_csv(performances_filepath, index=False)
else:
try:
# If overwrite is False, tries reading existing metrics data and applying append on it
log_performances = pd.read_csv(performances_filepath)
full_performances = pd.concat([log_performances, df_performances])
full_performances.to_csv(performances_filepath, index=False)
except FileNotFoundError:
print('Model performance log not found at the specified path. Saving only the current metrics.')
df_performances.to_csv(performances_filepath, index=False)
return df_performances
def feature_importance_analysis(self, features, specific_model=None, graph=True, ax=None, top_n=30,
palette='viridis', save=False, features_filepath='features_info.csv'):
"""
This function retrieves the feature importance from a given model. It can also build a bar chart
for top_n most important features and plot it on the notebook.
Parameters
----------
:param features: list of model features used on training [type: list]
:param specific_model: name of a specific model whose feature importance should be returned [type: string, default: None]
:param graph: flag that guides bar chart plotting at the end of execution [type: bool, default: True]
:param ax: axis for plotting the bar chart [type: matplotlib.axes, default: None]
:param top_n: parameter for showing up just top most important features [type: int, default: 30]
:param palette: color configuration for feature importance bar chart [type: string, default: 'viridis']
:param save: flag for saving the dataset returned [type: bool, default: False]
:param features_filepath: path for saving the feature importance dataset [type: string, default: 'features_info.csv']
Returns
-------
:return: model_feature_importance: pandas DataFrame with feature importances extracted by trained models
"""
# Iterating over each trained classifier in the classifiers_info dictionary
feat_imp = pd.DataFrame({})
all_feat_imp = pd.DataFrame({})
for model_name, model_info in self.classifiers_info.items():
# Creating a pandas DataFrame with model feature importance
try:
importances = model_info['estimator'].feature_importances_
except:
# If the given model doesn't have the feature_importances_ attribute, just skip to the next one
continue
# Preparing the dataset with useful information
feat_imp['feature'] = features
feat_imp['importance'] = importances
feat_imp['anomesdia'] = datetime.now().strftime('%Y%m')
feat_imp['anomesdia_datetime'] = datetime.now()
feat_imp.sort_values(by='importance', ascending=False, inplace=True)
feat_imp.reset_index(drop=True, inplace=True)
# Saving the feature importance at model's dictionary (classifiers_info)
self.classifiers_info[model_name]['feature_importances'] = feat_imp
feat_imp['model'] = model_name
all_feat_imp = pd.concat([all_feat_imp, feat_imp])
# Retrieving feature importance for a specific model
if specific_model is not None:
try:
model_feature_importance = self.classifiers_info[specific_model]['feature_importances']
if graph:
# Plotting the bar chart
sns.barplot(x='importance', y='feature', data=model_feature_importance.iloc[:top_n, :],
ax=ax, palette=palette)
format_spines(ax, right_border=False)
ax.set_title(f'Top {top_n} Most Important Features - {specific_model}', size=14, color='dimgrey')
# Saving features for a specific model
if save:
model_feature_importance['model'] = specific_model
order_cols = ['anomesdia', 'anomesdia_datetime', 'model', 'feature', 'importance']
model_feature_importance = model_feature_importance.loc[:, order_cols]
model_feature_importance.to_csv(features_filepath, index=False)
return model_feature_importance
except:
# Exception raised if the "specific_model" param doesn't match with any model's dictionary key
print(f'Classifier {specific_model} is not among the trained classifiers keys.')
print(f'Available options: {list(self.classifiers_info.keys())}')
return None
else:
# Validating feature importance saving if not passing specific_model param
if save:
order_cols = ['anomesdia', 'anomesdia_datetime', 'model', 'feature', 'importance']
all_feat_imp = all_feat_imp.loc[:, order_cols]
all_feat_imp.to_csv(features_filepath, index=False)
return all_feat_imp
# Non-matching param combination (it can't be possible plotting bar chart for all models)
if graph and specific_model is None:
print('Please choose a specific model to plot the feature importances bar chart')
return None
def plot_roc_curve(self, figsize=(16, 6)):
"""
This function iterates over each estimator in classifiers_info dictionary and plots the ROC Curve for
each one for training (first axis) and testing data (second axis)
Parameters
----------
:param figsize: figure size for the plot [type: tuple, default: (16, 6)]
Returns
-------
This function doesn't return anything but the matplotlib plot for ROC Curve
Application
-----------
trainer.plot_roc_curve()
"""
# Creating matplotlib figure and axis for ROC Curve plot
fig, axs = plt.subplots(ncols=2, figsize=figsize)
# Iterating over trained models
for model_name, model_info in self.classifiers_info.items():
# Returning y data for the model (training and testing)
y_train = model_info['model_data']['y_train']
y_test = model_info['model_data']['y_test']
# Returning scores already calculated after performance evaluation
train_scores = model_info['train_scores']
test_scores = model_info['test_scores']
# Calculating false positives and true positives rate
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, train_scores)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, test_scores)
# Returning the auc metric for training and testing already calculated after model evaluation
train_auc = model_info['train_performance']['auc'].values[0]
test_auc = model_info['test_performance']['auc'].values[0]
# Plotting graph (training data)
plt.subplot(1, 2, 1)
plt.plot(train_fpr, train_tpr, linewidth=2, label=f'{model_name} auc={train_auc}')
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([-0.02, 1.02, -0.02, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - Train Data')
plt.legend()
# Plotting graph (testing data)
plt.subplot(1, 2, 2)
plt.plot(test_fpr, test_tpr, linewidth=2, label=f'{model_name} auc={test_auc}')
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([-0.02, 1.02, -0.02, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - Test Data', size=12)
plt.legend()
plt.show()
def custom_confusion_matrix(self, model_name, y_true, y_pred, classes, cmap, normalize=False):
"""
This function is used for plotting and customizing a confusion matrix for a specific model. In practice,
this function can be called by a top level one for plotting matrix for many models.
Parameters
----------
:param model_name: key reference for extracting model's estimator from classifiers_dict [type: string]
:param y_true: label reference for the target variable [type: np.array]
:param y_pred: array of predictions given by the respective model [type: np.array]
:param classes: list of class labels used on the matrix axes [type: list]
:param cmap: this parameters guides the colorway for the matrix [type: matplotlib.colormap]
:param normalize: normalizes the entries for the matrix [type: bool, default: False]
Returns
-------
:return: This function doesn't return anything; it only plots the confusion matrix
Application
-----------
Please refer to the self.plot_confusion_matrix() function
"""
# Returning a confusion matrix given the labels and predictions passed as args
conf_mx = confusion_matrix(y_true, y_pred)
# Plotting the matrix
plt.imshow(conf_mx, interpolation='nearest', cmap=cmap)
plt.colorbar()
tick_marks = np.arange(len(classes))
# Customizing axes
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
# Customizing entries
fmt = '.2f' if normalize else 'd'
thresh = conf_mx.max() / 2.
for i, j in itertools.product(range(conf_mx.shape[0]), range(conf_mx.shape[1])):
plt.text(j, i, format(conf_mx[i, j], fmt),
horizontalalignment='center',
color='white' if conf_mx[i, j] > thresh else 'black')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title(f'{model_name}\nConfusion Matrix', size=12)
def plot_confusion_matrix(self, classes, normalize=False, cmap=plt.cm.Blues):
"""
This function plots a confusion matrix for training and testing data for each classifier at
self.classifiers_dict dictionary
Parameters
----------
:param classes: labels for the target variable [type: string]
:param normalize: flag that guides the normalization of matrix values [type: bool, default: False]
:param cmap: param that colorizes the matrix [type: plt.cm, default: plt.cm.Blues]
Returns
-------
This function doesn't return anything but the matplotlib plot for confusion matrix
"""
# Defining parameters for plotting
k = 1
nrows = len(self.classifiers_info.keys())
fig = plt.figure(figsize=(10, nrows * 4))
sns.set(style='white', palette='muted', color_codes=True)
# Iterating over each classifier
for model_name, model_info in self.classifiers_info.items():
# Returning data from each model
X_train = model_info['model_data']['X_train']
y_train = model_info['model_data']['y_train']
X_test = model_info['model_data']['X_test']
y_test = model_info['model_data']['y_test']
# Making predictions for training (cross validation) and testing for returning confusion matrix
train_pred = cross_val_predict(model_info['estimator'], X_train, y_train, cv=5)
test_pred = model_info['estimator'].predict(X_test)
# Plotting matrix (training data)
plt.subplot(nrows, 2, k)
self.custom_confusion_matrix(model_name + ' Train', y_train, train_pred, classes=classes, cmap=cmap,
normalize=normalize)
k += 1
# Plotting matrix (testing data)
plt.subplot(nrows, 2, k)
self.custom_confusion_matrix(model_name + ' Test', y_test, test_pred, classes=classes, cmap=plt.cm.Greens,
normalize=normalize)
k += 1
plt.tight_layout()
plt.show()
def plot_learning_curve(self, model_name, ax, ylim=None, cv=5, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
"""
This function calculates and plots the learning curve for a trained model
Parameters
----------
:param model_name: Key reference for extracting an estimator from classifiers_dict dictionary [type: string]
:param ax: axis reference for plotting the learning curve [type: matplotlib.axis]
:param ylim: configuration of the limit on plot vertical axis [type: int, default: None]
:param cv: k-folds used on cross validation [type: int, default: 5]
:param n_jobs: number of cores used on retrieving the learning curve params [type: int, default: 1]
:param train_sizes: array that guides the steps bins used on learning curve [type: np.array,
default:np.linspace(.1, 1.0, 10)]
Returns
-------
This function doesn't return anything but the matplotlib plot for the learning curve
Application
-----------
# Plotting the learning curve for a specific model
fig, ax = plt.subplots(figsize=(16, 6))
trainer.plot_learning_curve(model_name='LightGBM', ax=ax)
"""
# Returning the model to be evaluated
try:
model = self.classifiers_info[model_name]
except:
print(f'Classifier {model_name} has not been trained.')
print(f'Available options: {list(self.classifiers_info.keys())}')
return None
# Returning useful data for the model
X_train = model['model_data']['X_train']
y_train = model['model_data']['y_train']
# Calling the learning curve model for retrieving the scores for training and validation
train_sizes, train_scores, val_scores = learning_curve(model['estimator'], X_train, y_train, cv=cv,
n_jobs=n_jobs, train_sizes=train_sizes)
# Computing averages and standard deviation (training and validation)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)
# Results on training data
ax.plot(train_sizes, train_scores_mean, 'o-', color='navy', label='Training Score')
ax.fill_between(train_sizes, (train_scores_mean - train_scores_std), (train_scores_mean + train_scores_std),
alpha=0.1, color='blue')
# Results on cross validation
ax.plot(train_sizes, val_scores_mean, 'o-', color='red', label='Cross Val Score')
ax.fill_between(train_sizes, (val_scores_mean - val_scores_std), (val_scores_mean + val_scores_std),
alpha=0.1, color='crimson')
# Customizing graph
ax.set_title(f'Model {model_name} - Learning Curve', size=14)
ax.set_xlabel('Training size (m)')
ax.set_ylabel('Score')
ax.grid(True)
ax.legend(loc='best')
def plot_score_distribution(self, model_name, shade=False):
"""
This function plots a kdeplot for training and testing data splitting by target class
Parameters
----------
:param model_name: key reference for the trained model [type: string]
:param shade: shade param for seaborn's kdeplot [type: bool, default: False]
Returns
-------
This function doesn't return anything but the matplotlib plot for the score distribution
Application
-----------
# Plotting the score distribution for a model
trainer.plot_score_distribution(model_name='LightGBM', shade=True)
"""
# Returning the model to be evaluated
try:
model = self.classifiers_info[model_name]
except:
print(f'Classifier {model_name} has not been trained.')
print(f'Available options: {list(self.classifiers_info.keys())}')
return None
# Retrieving y array for training and testing data
y_train = model['model_data']['y_train']
y_test = model['model_data']['y_test']
# Retrieving training and testing scores
train_scores = model['train_scores']
test_scores = model['test_scores']
# Plotting scores distribution
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
sns.kdeplot(train_scores[y_train == 1], ax=axs[0], label='y=1', shade=shade, color='darkslateblue')
sns.kdeplot(train_scores[y_train == 0], ax=axs[0], label='y=0', shade=shade, color='crimson')
sns.kdeplot(test_scores[y_test == 1], ax=axs[1], label='y=1', shade=shade, color='darkslateblue')
sns.kdeplot(test_scores[y_test == 0], ax=axs[1], label='y=0', shade=shade, color='crimson')
# Customizing plots
format_spines(axs[0], right_border=False)
format_spines(axs[1], right_border=False)
axs[0].set_title('Score Distribution - Training Data', size=12, color='dimgrey')
axs[1].set_title('Score Distribution - Testing Data', size=12, color='dimgrey')
plt.suptitle(f'Score Distribution: a Probability Approach for {model_name}\n', size=14, color='black')
plt.show()
def plot_score_bins(self, model_name, bin_range):
"""
This function plots a score distribution based on quantity of each class in a specific bin_range set
Parameters
----------
:param model_name: key reference for the trained model [type: string]
:param bin_range: defines a range of splitting the bins array [type: float]
Returns
-------
This function doesn't return anything but the matplotlib plot for the score bins distribution
Application
-----------
# Plotting the score distribution for a model using bins
trainer.plot_score_bins(model_name='LightGBM', bin_range=0.1)
"""
# Returning the model to be evaluated
try:
model = self.classifiers_info[model_name]
except:
print(f'Classifier {model_name} has not been trained.')
print(f'Available options: {list(self.classifiers_info.keys())}')
return None
# Creating the bins array
bins = np.arange(0, 1.01, bin_range)
bins_labels = [str(round(list(bins)[i - 1], 2)) + ' to ' + str(round(list(bins)[i], 2)) for i in range(len(bins))
if i > 0]
# Retrieving the train scores and creating a DataFrame
train_scores = model['train_scores']
y_train = model['model_data']['y_train']
df_train_scores = pd.DataFrame({})
df_train_scores['scores'] = train_scores
df_train_scores['target'] = y_train
df_train_scores['faixa'] = pd.cut(train_scores, bins, labels=bins_labels)
# Computing the distribution for each bin
df_train_rate = pd.crosstab(df_train_scores['faixa'], df_train_scores['target'])
df_train_percent = df_train_rate.div(df_train_rate.sum(1).astype(float), axis=0)
# Retrieving the test scores and creating a DataFrame
test_scores = model['test_scores']
y_test = model['model_data']['y_test']
df_test_scores = pd.DataFrame({})
df_test_scores['scores'] = test_scores
df_test_scores['target'] = y_test
df_test_scores['faixa'] = pd.cut(test_scores, bins, labels=bins_labels)
# Computing the distribution for each bin
df_test_rate = pd.crosstab(df_test_scores['faixa'], df_test_scores['target'])
df_test_percent = df_test_rate.div(df_test_rate.sum(1).astype(float), axis=0)
# Defining figure for plotting
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
# Plotting the bar chart for each bin
for df_scores, ax in zip([df_train_scores, df_test_scores], [axs[0, 0], axs[0, 1]]):
sns.countplot(x='faixa', data=df_scores, hue='target', ax=ax, palette=['darkslateblue', 'crimson'])
AnnotateBars(n_dec=0, color='dimgrey').vertical(ax)
ax.legend(loc='upper right')
format_spines(ax, right_border=False)
# Plotting percentage for each class in each bin
for df_percent, ax in zip([df_train_percent, df_test_percent], [axs[1, 0], axs[1, 1]]):
df_percent.plot(kind='bar', ax=ax, stacked=True, color=['darkslateblue', 'crimson'], width=0.6)
# Customizing plots
for p in ax.patches:
# Collecting params for labeling
height = p.get_height()
width = p.get_width()
x = p.get_x()
y = p.get_y()
# Formatting params and putting into the graph
label_text = f'{round(100 * height, 1)}%'
label_x = x + width - 0.30
label_y = y + height / 2
ax.text(label_x, label_y, label_text, ha='center', va='center', color='white',
fontweight='bold', size=10)
format_spines(ax, right_border=False)
# Final definitions
axs[0, 0].set_title('Quantity of each Class by Range - Train', size=12, color='dimgrey')
axs[0, 1].set_title('Quantity of each Class by Range - Test', size=12, color='dimgrey')
axs[1, 0].set_title('Percentage of each Class by Range - Train', size=12, color='dimgrey')
axs[1, 1].set_title('Percentage of each Class by Range - Test', size=12, color='dimgrey')
plt.suptitle(f'Score Distribution by Range - {model_name}\n', size=14, color='black')
plt.tight_layout()
plt.show()
def shap_analysis(self, model_name, features):
"""
This function brings a shap analysis for each feature into the model
Parameters
----------
:param model_name: key reference for the trained model [type: string]
:param features: features list for the model [type: list]
Returns
-------
This function doesn't return anything but the shap plot analysis
Application
-----------
# Executing a shap analysis
trainer.shap_analysis(model_name='LightGBM', features=features)
"""
# Returning the model to be evaluated
try:
model = self.classifiers_info[model_name]
except:
print(f'Classifier {model_name} has not been trained.')
print(f'Available options: {list(self.classifiers_info.keys())}')
return None
# Retrieving training data
X_train = model['model_data']['X_train']
# Applying shap approach
explainer = shap.TreeExplainer(model['estimator'])
df_train = pd.DataFrame(X_train, columns=features)
shap_values = explainer.shap_values(df_train)
# Plotting a summary plot using shap
shap.summary_plot(shap_values[1], df_train)
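# --------------------------------------------------------------------------
# The sketch below is a minimal, hypothetical usage example for the
# BinaryClassifiersAnalysis class defined above. It assumes a synthetic
# dataset built with sklearn's make_classification and two tree-based
# estimators; the function name, data and hyperparameter grids are
# illustrative only and should be adapted to your own project.
def example_binary_classification_workflow():
    """Minimal sketch: trains and evaluates two classifiers on synthetic data."""
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # Synthetic binary classification data (assumption: replace with real, prepared features)
    X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Dictionary of estimators and their hyperparameter search spaces
    set_classifiers = {
        'DecisionTree': {
            'model': DecisionTreeClassifier(random_state=42),
            'params': {'max_depth': [3, 5, 7, 9, None], 'criterion': ['gini', 'entropy']}
        },
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10, None]}
        }
    }

    # Training with randomized search and evaluating on train (cross-validation) and test data
    trainer = BinaryClassifiersAnalysis()
    trainer.fit(set_classifiers, X_train, y_train, random_search=True, cv=5, verbose=0)
    df_performance = trainer.evaluate_performance(X_train, y_train, X_test, y_test, cv=5)
    return trainer, df_performance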
def cross_val_performance(estimator, X, y, cv=5):
"""
This standalone function applies cross validation to retrieve useful metrics for a final classification model,
working outside the BinaryClassifiersAnalysis class.
Parameters
----------
:param estimator: a trained model to be used on evaluation [type: model]
:param X: object containing features already prepared for training the model [type: np.array]
:param y: object containing the model target variable [type: np.array]
:param cv: k-folds for cross validation application on training evaluation
Return
------
:return train_performance: DataFrame containing model metrics calculated using cross validation
Application
-----------
# Evaluating a final model using cross-validation
df_performance = cross_val_performance(final_model, X_train, y_train, cv=5)
"""
# Computing metrics using cross validation
t0 = time.time()
accuracy = cross_val_score(estimator, X, y, cv=cv, scoring='accuracy').mean()
precision = cross_val_score(estimator, X, y, cv=cv, scoring='precision').mean()
recall = cross_val_score(estimator, X, y, cv=cv, scoring='recall').mean()
f1 = cross_val_score(estimator, X, y, cv=cv, scoring='f1').mean()
# Probas for calculating AUC
try:
y_scores = cross_val_predict(estimator, X, y, cv=cv, method='decision_function')
except:
# Tree based models don't have 'decision_function()' method, but 'predict_proba()'
y_probas = cross_val_predict(estimator, X, y, cv=cv, method='predict_proba')
y_scores = y_probas[:, 1]
auc = roc_auc_score(y, y_scores)
# Creating a DataFrame with metrics
t1 = time.time()
delta_time = t1 - t0
train_performance = {}
train_performance['model'] = estimator.__class__.__name__
train_performance['approach'] = 'Final Model'
train_performance['acc'] = round(accuracy, 4)
train_performance['precision'] = round(precision, 4)
train_performance['recall'] = round(recall, 4)
train_performance['f1'] = round(f1, 4)
train_performance['auc'] = round(auc, 4)
train_performance['total_time'] = round(delta_time, 3)
df_performance = pd.DataFrame(train_performance, index=[0])
# Adding information of measuring and execution time
cols_performance = list(df_performance.columns)
df_performance['anomesdia'] = datetime.now().strftime('%Y%m%d')
df_performance['anomesdia_datetime'] = datetime.now()
df_performance = df_performance.loc[:, ['anomesdia', 'anomesdia_datetime'] + cols_performance]
return df_performance
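# --------------------------------------------------------------------------
# Minimal sketch of the standalone cross_val_performance function above,
# assuming a hypothetical final estimator and a synthetic (X, y) training set;
# names like final_model are illustrative assumptions.
def example_final_model_evaluation():
    """Minimal sketch: evaluates a single final model with cross validation."""
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # Hypothetical prepared data and final estimator
    X, y = make_classification(n_samples=500, n_features=8, random_state=0)
    final_model = LogisticRegression(max_iter=1000).fit(X, y)

    # Cross-validated metrics (accuracy, precision, recall, f1, auc) for the final model
    df_performance = cross_val_performance(final_model, X, y, cv=5)
    return df_performance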
"""
-----------------------------------
---------- 2. CLUSTERING ----------
-----------------------------------
"""
def elbow_method_kmeans(df, K_min, K_max, figsize=(16, 6)):
"""
This function applies the elbow method using K-means clustering algorithm in order to select the best number of
k clusters for the task
Parameters
----------
:param df: DataFrame with data to be clustered [type: pd.DataFrame]
:param K_min: minimum index of K to be evaluated on elbow method [type: int]
:param K_max: maximum index of K to be evaluated on elbow method [type: int]
:param figsize: figure dimension for the matplotlib plot [type: tuple, default: (16, 6)]
Returns
-------
:return: This function doesn't return anything but the elbow method plot for further analysis
Application
-----------
# Looking at the elbow plot for selecting the best number of clusters k
elbow_method_kmeans(df=data, K_min=1, K_max=8)
"""
# Training a K-Means model for each number of clusters in the range [K_min, K_max]
square_dist = []
for k in range(K_min, K_max+1):
km = KMeans(n_clusters=k)
km.fit(df)
# Computing and keeping the square distances for each model configuration
square_dist.append(km.inertia_)
# Plotting elbow analysis
fig, ax = plt.subplots(figsize=figsize)
sns.lineplot(x=range(K_min, K_max + 1), y=square_dist, color='cornflowerblue', marker='o')
# Customizing the graph
format_spines(ax, right_border=False)
ax.set_title('Elbow Method - KMeans Model', size=14, color='dimgrey')
ax.set_xlabel('Total of Clusters (K)')
ax.set_ylabel('Sum of Squared Distances (Inertia)')
plt.show()
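# --------------------------------------------------------------------------
# Minimal sketch of the elbow method helper above, assuming a small synthetic
# dataset created with sklearn's make_blobs; column names are illustrative.
def example_elbow_method():
    """Minimal sketch: plots the elbow curve for 1 to 8 clusters on synthetic data."""
    from sklearn.datasets import make_blobs

    # Synthetic clustering data (assumption: replace with real, scaled features)
    X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
    data = pd.DataFrame(X, columns=['feature_1', 'feature_2'])

    # Elbow plot for choosing the number of clusters
    elbow_method_kmeans(df=data, K_min=1, K_max=8)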
def plot_kmeans_clusters_2d(df, k_means, figsize=(16, 6), cmap='viridis'):
"""
This function plots and customizes a 2D result from a trained K-means algorithm
Parameters
----------
:param df: DataFrame with the two columns used on training the K-means algorithm [type: pd.DataFrame]
:param k_means: trained K-means model [type: sklearn.cluster.KMeans]
:param figsize: figure dimension for the plot [type: tuple, default: (16, 6)]
:param cmap: param for coloring the plot with a specific palette [type: string, default: 'viridis']
Returns
-------
:return: This function doesn't return anything but the K-means algorithm plot
Application
-----------
# Visualizing the kmeans result on a trained 2D model data
model = KMeans(n_clusters=3)
cluster_data = df.loc[:, two_columns_list]
model.fit(cluster_data)
plot_kmeans_clusters_2d(cluster_data, model)
"""
# Computing y_kmeans and centers
y_kmeans = k_means.predict(df)
centers = k_means.cluster_centers_
# Retrieving values and defining layout
variaveis = df.columns
X = df.values
sns.set(style='white', palette='muted', color_codes=True)
# Plotting the scatterplot chart
fig, ax = plt.subplots(figsize=figsize)
ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap=cmap)
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
# Customizing the graph
ax.set_title(f'K-Means applied to {variaveis[0].upper()} and {variaveis[1].upper()}', size=14, color='dimgrey')
format_spines(ax, right_border=False)
ax.set_xlabel(variaveis[0])
ax.set_ylabel(variaveis[1])
plt.show()
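# --------------------------------------------------------------------------
# Minimal sketch of the 2D K-means visualization above, assuming two synthetic
# features generated with make_blobs and a K-means model with k=3; the function
# name and column names are illustrative assumptions.
def example_kmeans_2d_plot():
    """Minimal sketch: fits K-means on two features and plots the clusters."""
    from sklearn.datasets import make_blobs

    # Two-dimensional synthetic data (assumption: use two real columns in practice)
    X, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=0)
    cluster_data = pd.DataFrame(X, columns=['feature_1', 'feature_2'])

    # Fitting the model and plotting the clusters with their centroids
    model = KMeans(n_clusters=3, random_state=0)
    model.fit(cluster_data)
    plot_kmeans_clusters_2d(cluster_data, model)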