# -*- coding: utf-8 -*-
"""
https://machinelearningmastery.com/linear-discriminant-analysis-with-python/

LDA description:
LDA is a linear classification algorithm, like logistic regression:
classes are separated in the feature space by lines or hyperplanes.
It works by calculating summary statistics for the input features by class
label, such as the mean and standard deviation. These statistics represent
the model learned from the training data.
Predictions are made by estimating the probability that a new example
belongs to each class label based on the values of each input feature; the
class with the largest probability is assigned to the example. As such, LDA
can be considered a simple application of Bayes' theorem to classification.

Data assumptions:
- the observations within each class come from a normal distribution with a
  class-specific mean vector and a common (shared) variance
- the input variables are not correlated with each other

Preprocessing:
- predictors should be centered and scaled, and near-zero variance
  predictors should be removed (see the pipeline sketch below)
"""
#=========================================================================
# evaluate an LDA model on a synthetic dataset
#=========================================================================
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from numpy import arange
from sklearn.model_selection import GridSearchCV
# define a synthetic classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, random_state=1)
# define model
model = LinearDiscriminantAnalysis()
# define model evaluation method
# we can repeat the k-fold cross-validation process multiple times and report the mean performance across all folds and all repeats
# stratification keeps the class proportions in each fold the same as in the full dataset
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
#=========================================================================
# make a prediction with the LDA model on the dataset
#=========================================================================
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, random_state=1)
# define model
model = LinearDiscriminantAnalysis()
# fit model
model.fit(X, y)
# define new data
row = [0.12777556,-3.64400522,-2.23268854,-1.82114386,1.75466361,0.1243966,1.03397657,2.35822076,1.01001752,0.56768485]
# make a prediction
yhat = model.predict([row])
# summarize prediction
print('Predicted Class: %d' % yhat[0])
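# the per-class probabilities behind the prediction can also be inspected;
# predict_proba returns one column per class and the rows sum to 1
print('Class Probabilities: %s' % model.predict_proba([row]))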
#=========================================================================
# Tune hyperparameters: try the different solvers ['svd', 'lsqr', 'eigen']
#=========================================================================
# define model
model = LinearDiscriminantAnalysis()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)
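# GridSearchCV refits the best configuration on the whole dataset by default
# (refit=True), so the tuned model can be used directly for prediction
best_model = results.best_estimator_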
#=========================================================================
# Tune hyperparameters: use shrinkage
#=========================================================================
# Shrinkage adds a penalty to the model that acts as a type of regularizer, reducing the complexity of the model.
# Regularization reduces the variance associated with the sample-based estimate at the expense of potentially increased bias.
# define model
model = LinearDiscriminantAnalysis(solver='lsqr')  # shrinkage requires the 'lsqr' or 'eigen' solver; 'svd' does not support it
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['shrinkage'] = arange(0, 1, 0.01)
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)
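#=========================================================================
# alternative to the grid search (an added sketch, not from the original
# tutorial): scikit-learn can also set the shrinkage intensity
# analytically with the Ledoit-Wolf lemma via shrinkage='auto'
#=========================================================================
auto_model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
auto_scores = cross_val_score(auto_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Mean Accuracy (auto shrinkage): %.3f (%.3f)' % (mean(auto_scores), std(auto_scores)))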