# To add a new cell, type '#%%'
# To add a new markdown cell, type '#%% [markdown]'
#%%
from IPython import get_ipython
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%% [markdown]
# <h1>SIT 720 - Python Intro</h1>
#%% [markdown]
# <h2>Logistic regression in Python</h2>
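#%% [markdown]
# As background (standard material, added here for context): logistic regression models the
# probability of the positive class through the sigmoid function,
#
# $$P(y = 1 \mid x) = \sigma(w^T x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}},$$
#
# and classifies an observation as positive when this probability exceeds 0.5.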
#%%
#Load data
examData = pd.read_csv('/Users/ragyibrahim/Downloads/ExamScores.csv')
rows, cols = examData.shape
print('Data has {} rows and {} cols'.format(rows, cols))
examData.head()
#%%
#Split data into training (70%) and testing (30%) sets
#(pass random_state to train_test_split for a reproducible split)
from sklearn.model_selection import train_test_split
trainData, testData = train_test_split(examData, test_size = 0.3)
print(trainData.shape)
print(testData.shape)
#%%
#Separate the dependent and independent variables
examPredictors = ['Exam1', 'Exam2']
examResponse = 'Admit'
print(trainData[examPredictors].head())
print()
print(testData[examPredictors].head())
#%% [markdown]
# <h4>Regularisation using $L_1$ or $L_2$</h4>
#
# You can perform regularised logistic regression by specifying two arguments in the function call:
#
# - penalty: this takes the value **l1** for lasso-style or **l2** for ridge-style regularisation (the l1 penalty requires a solver that supports it, such as liblinear or saga)
# - C: this is the inverse of the regularisation parameter alpha or lambda; smaller values specify stronger regularisation
#
# *You can refer to the scikit-learn documentation for more detailed information.*
#
# Let's try with $\lambda = 0.1$, so we have to set $C = \frac{1}{0.1} = 10$.
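#%% [markdown]
# For reference, scikit-learn's user guide gives the $L_2$-penalised objective for
# LogisticRegression (with labels encoded as $y_i \in \{-1, 1\}$) as
#
# $$\min_{w, b} \; \frac{1}{2} w^T w + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (x_i^T w + b)}\right),$$
#
# which is why the penalty strength enters as $C = \frac{1}{\lambda}$ on the data term
# rather than as $\lambda$ on the penalty term.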
#%%
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#Define hyperparameters
lambdaValue = 0.1
C = 1/lambdaValue
#Fit model (the liblinear solver supports both the l1 and l2 penalties)
logRegModel = LogisticRegression(C = C, penalty = 'l2', solver = 'liblinear')
logRegModel.fit(trainData[examPredictors], trainData[examResponse])
yPredictLogRidge = logRegModel.predict(testData[examPredictors])
#%%
#Evaluate the model performance
logRegAcc = accuracy_score(testData[examResponse], yPredictLogRidge)
print('Model Accuracy is: {}'.format(logRegAcc))
#Weight vector: intercept first, then the coefficients for Exam1 and Exam2
print("Model Coeff: {}".format(np.append(logRegModel.intercept_, logRegModel.coef_)))
#%% [markdown]
# Now, as an exercise, let's do the following: for a list of penalty values, calculate the average accuracy over 500 runs of $L_1$- and $L_2$-regularised logistic regression.
#
# Let's begin by defining a function that takes in the data, the penalty type and the penalty value. For a fixed number of runs ('trials'), the data is randomly split 70/30 as train/test, and the average test accuracy is calculated.
#%%
def runLogRegModel(trials, data, predictors, label, penType, penScore):
    #Accumulators for accuracy and for the weight vector (intercept + one weight per predictor)
    modelAcc = 0
    modelWeight = np.zeros([1, len(predictors) + 1])
    for i in range(trials):
        #Fit the model to a fresh random 70/30 split
        trainData, testData = train_test_split(data, test_size = 0.3)
        logRegModel = LogisticRegression(C = 1/penScore, penalty = penType, solver = 'liblinear')
        logRegModel.fit(trainData[predictors], trainData[label])
        yPredict = logRegModel.predict(testData[predictors])
        #Accumulate accuracy and weights across trials
        modelAcc += accuracy_score(testData[label], yPredict)
        modelWeight += np.append(logRegModel.intercept_, logRegModel.coef_)
    #Average the scores over all trials
    modelAcc /= trials
    modelWeight /= trials
    #Return the averaged model accuracy and weights
    return np.round(modelAcc, decimals = 2), np.round(modelWeight, decimals = 2)
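#%% [markdown]
# Before running the full experiments, here is a quick sanity check of the helper
# (a minimal sketch; the call below and its variable names are illustrative, not part of the exercise):
#%%
accCheck, wCheck = runLogRegModel(10, examData, examPredictors, 'Admit', 'l2', 1.0)
print('Avg accuracy over 10 trials: {}'.format(accCheck))
print('Avg weights (intercept, Exam1, Exam2): {}'.format(wCheck))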
#%% [markdown]
# <h5>$L_2$ Regularisation</h5>
#
# Using the above function, let's now try to find the best **lambda** value from 500 random splits of our data:
#%%
#Define the set of lambda values to iterate over
lambdaValues = [.0001, .0003, .001, .003, .01, .03, .1, .3, 1, 3, 5, 10]
l2Acc = np.zeros(len(lambdaValues))
#L2 regularisation
for index, i in enumerate(lambdaValues):
    l2Acc[index], w = runLogRegModel(500, examData, examPredictors, 'Admit', 'l2', float(i))
#Print the accuracy for each lambda
print("Acc: {}".format(l2Acc))
#Penalty at which validation accuracy is maximal
maxAccIndexL2 = np.argmax(l2Acc)
#Find the corresponding optimal lambda value
optiLambda = lambdaValues[maxAccIndexL2]
print("Optimal Lambda value: {}".format(optiLambda))
#%% [markdown]
# <h5>$L_1$ Regularisation</h5>
#
# Using the above function, let's now try to find the best **alpha** value from 500 random splits of our data:
#%%
#Define the set of alpha values to iterate over
alphaValues = [.0001, .0003, .001, .003, .01, .03, .1, .3, 1, 3, 5, 10]
l1Acc = np.zeros(len(alphaValues))
#L1 regularisation
for index, i in enumerate(alphaValues):
    l1Acc[index], w = runLogRegModel(500, examData, examPredictors, 'Admit', 'l1', float(i))
#Print the accuracy for each alpha
print("Acc: {}".format(l1Acc))
#Penalty at which validation accuracy is maximal
maxAccIndexL1 = np.argmax(l1Acc)
#Find the corresponding optimal alpha value
optiAlpha = alphaValues[maxAccIndexL1]
print("Optimal Alpha value: {}".format(optiAlpha))
#%% [markdown]
# <h5>Comparing $L_2$ and $L_1$ metrics</h5>
#
# We now plot the average model accuracy against the penalty values, marking the best
# values of alpha and lambda with dotted vertical lines.
#%%
#Plot the accuracy curves
plt.plot(range(len(lambdaValues)), l2Acc, color='b', label='L2')
plt.plot(range(len(alphaValues)), l1Acc, color='r', label='L1')
#Replace the x-axis ticks with the penalty values
plt.xticks(range(len(lambdaValues)), lambdaValues, rotation='vertical')
#Highlight the best values of alpha and lambda
plt.plot((maxAccIndexL2, maxAccIndexL2), (0, l2Acc[maxAccIndexL2]), ls='dotted', color='b')
plt.plot((maxAccIndexL1, maxAccIndexL1), (0, l1Acc[maxAccIndexL1]), ls='dotted', color='r')
#Set the y-axis from 0 to 1.0 and label the axes
axes = plt.gca()
axes.set_ylim([0, 1.0])
plt.xlabel('Penalty value')
plt.ylabel('Average accuracy')
plt.legend(loc="lower left")
plt.show()
#%%
#Collect the fitted weights: intercept first, then the two exam coefficients
weights = np.append(logRegModel.intercept_, logRegModel.coef_)
#%%
#Should be (3,): one intercept plus one coefficient per predictor
weights.shape
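#%% [markdown]
# A small readability sketch (the pairing order is an assumption: intercept first,
# then the coefficients in the order of examPredictors):
#%%
for name, value in zip(['Intercept'] + examPredictors, weights):
    print('{}: {:.4f}'.format(name, value))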
#%%
#Iterate over the items in a sequence and check whether each element is an
#int or np.float64 (the bool in each tuple returns True or False)
[(e, type(e), isinstance(e, (int, np.float64))) for e in l2Acc]