-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExercise_1_5.py
109 lines (84 loc) · 3.7 KB
/
Exercise_1_5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 21 15:35:14 2020
@author: khurana
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
# Column labels handed to read_csv via ``names``; the last one ("rings") is
# the regression target used below.
column_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole weight",
    "shucked weight",
    "viscera weight",
    "shell weight",
    "rings",
]
data = pd.read_csv(
    "Y:/Home/khurana/1.Scripts/ml_summerschool_092020/datasets/abalone.data",
    names=column_names,
)
print("Number of samples: %d" % len(data))
data.head()

# Target stays a one-column DataFrame; every remaining column is a feature.
y = data[["rings"]]
X = data.drop(columns=["rings"])

# Length against shell weight, with each point coloured by its ring count.
plt.scatter(data["length"], data["shell weight"], c=data["rings"])
plt.xlabel("Length")
plt.ylabel("Shell Weight")
plt.title("Variation of age with length and weight of the shell")
def linreg_singfeature(data, target, features, ratio):
    """Fit a linear regression and evaluate it on the trailing rows.

    The last ``ratio`` rows of ``data``/``target`` are held out as the test
    set and all earlier rows are used for training. The fitted coefficients,
    the mean squared error on the test set and the estimator's ``score`` on
    the test set are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; must contain every column listed in ``features``.
    target : pandas.DataFrame or pandas.Series
        Values to predict, row-aligned with ``data``.
    features : list of str
        Columns of ``data`` used as predictors.
    ratio : int
        Number of trailing rows held out for testing. Despite the name this
        is a row count, not a fraction.

    Returns
    -------
    tuple
        ``(y_pred, y_test)``: predictions and true values for the test rows.
        Both are ordered by ascending ``features`` so that the i-th
        prediction corresponds to the i-th true value.

    Raises
    ------
    ValueError
        If ``ratio`` does not leave at least one training and one test row.
    """
    X_temp = data[features]
    y_temp = target

    n_rows = len(X_temp)
    if not 0 < ratio < n_rows:
        # ratio == 0 would make ``[:-ratio]`` an empty training set.
        raise ValueError(
            "ratio must be between 1 and %d (number of rows - 1), got %r"
            % (n_rows - 1, ratio)
        )

    # Split the data and the targets into training/testing sets.
    X_train, X_test = X_temp[:-ratio], X_temp[-ratio:]
    y_train, y_test = y_temp[:-ratio], y_temp[-ratio:]

    # Create the linear regression object and train it on the training set.
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    # The coefficients
    print('Coefficients: \n', regr.coef_)

    # The value is an average of squared errors, so it is labelled as a mean
    # squared error rather than a residual sum of squares.
    test_pred = regr.predict(X_test)
    mse = float(np.mean((np.asarray(test_pred) - np.asarray(y_test)) ** 2))
    print("Mean squared error: %.2f" % mse)

    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % regr.score(X_test, y_test))

    # Order the test rows by the requested features. The same positional
    # permutation is applied to predictions and truth so they stay paired;
    # nothing is sorted in place, so the caller's frame is never touched.
    order = (
        X_test.reset_index(drop=True).sort_values(by=features).index.to_numpy()
    )
    y_pred = test_pred[order]
    if hasattr(y_test, "iloc"):
        y_test = y_test.iloc[order]
    else:
        y_test = np.asarray(y_test)[order]
    return y_pred, y_test
predictions, truth = linreg_singfeature(X, y, ["length", "shell weight"], 210)

# Compare prediction and truth: integer grid spanning both value ranges,
# padded by one on each side, used to draw the y = x reference line.
_lowest = min(np.min(predictions), truth.values.min())
_highest = max(np.max(predictions), truth.values.max())
compy = list(range(int(np.round(_lowest, 0)) - 1, int(np.round(_highest, 0)) + 1))

# Open a fresh figure so this comparison is not drawn on top of the
# length / shell-weight scatter created earlier in the script.
plt.figure()
plt.scatter(predictions, truth)
plt.plot(compy, compy, c="red")
plt.ylabel("Truth")
plt.xlabel("Prediction")
#K nearest neighbors
def run_knn(data, target, features, neardatapoints, weight, ratio):
    """Fit a k-nearest-neighbours regressor and predict the trailing rows.

    The last ``ratio`` rows of ``data``/``target`` are held out as the test
    set and all earlier rows are used for training.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; must contain every column listed in ``features``.
    target : pandas.DataFrame or pandas.Series
        Values to predict, row-aligned with ``data``.
    features : list of str
        Columns of ``data`` used as predictors. The test rows are ordered by
        the first entry.
    neardatapoints : int
        Passed to ``KNeighborsRegressor`` as ``n_neighbors``.
    weight : str or callable
        Passed to ``KNeighborsRegressor`` as ``weights``.
    ratio : int
        Number of trailing rows held out for testing. Despite the name this
        is a row count, not a fraction.

    Returns
    -------
    tuple
        ``(y_pred, y_test)``: predictions and true values for the test rows.
        Both are ordered by ascending ``features[0]`` so that the i-th
        prediction corresponds to the i-th true value.
    """
    neigh = KNeighborsRegressor(n_neighbors=neardatapoints, weights=weight)

    X_temp = data[features]
    y_temp = target

    # Split the data and the targets into training/testing sets.
    X_train, X_test = X_temp[:-ratio], X_temp[-ratio:]
    y_train, y_test = y_temp[:-ratio], y_temp[-ratio:]

    neigh.fit(X_train, y_train)

    # Sort the test rows by the first feature and apply the same positional
    # permutation to the true values. Sorting only the features would pair
    # each prediction with the truth of a different row.
    order = (
        X_test.reset_index(drop=True)
        .sort_values(by=[features[0]])
        .index.to_numpy()
    )
    X_test = X_test.iloc[order]
    if hasattr(y_test, "iloc"):
        y_test = y_test.iloc[order]
    else:
        y_test = np.asarray(y_test)[order]
    y_pred = neigh.predict(X_test)

    # Optional diagnostic plot, kept disabled:
    # plt.figure()
    # plt.title ("k-Nearest neighbors classification using distance weight and: " + str(neardatapoints))
    # plt.scatter(X_train[features[0]], y_train, s = 30, marker = "^", c = "grey", alpha = 0.2, label = "Training")
    # plt.scatter(X_test[features[0]], y_pred, s = 20, c='blue', label = "Prediction")
    # plt.scatter(X_test[features[0]], y_test, s = 30, c = "black", label = "Test")
    # plt.xlabel (features[0])
    # plt.ylabel ("Target")
    # plt.legend()
    # plt.show()
    return y_pred, y_test
# One prediction-vs-truth figure per neighbourhood size.
for n in (7, 100, 1000):
    predictions, truth = run_knn(
        X, y, ["length", "shell weight"], n, "distance", 210
    )

    # Integer grid spanning both value ranges, padded by one on each side,
    # used to draw the y = x reference line.
    _lowest = min(np.min(predictions), truth.values.min())
    _highest = max(np.max(predictions), truth.values.max())
    compy = list(
        range(int(np.round(_lowest, 0)) - 1, int(np.round(_highest, 0)) + 1)
    )

    plt.figure()
    plt.scatter(predictions, truth)
    plt.plot(compy, compy, c="red")
    plt.ylabel("Truth")
    plt.xlabel("Prediction")