-
Notifications
You must be signed in to change notification settings - Fork 284
/
Copy pathK Means Clustering.py
93 lines (66 loc) · 2.85 KB
/
K Means Clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#================================================================================================================
#----------------------------------------------------------------------------------------------------------------
# K MEANS CLUSTERING
#----------------------------------------------------------------------------------------------------------------
#================================================================================================================
# K-means clustering applied to normalized IPL player data.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
style.use('ggplot')
class K_Means:
    """Minimal K-means clusterer based on Euclidean distance.

    After ``fit`` the instance exposes two dictionaries keyed by the cluster
    index ``0 .. k-1``:

    * ``centroids`` -- the centroid of each cluster (NumPy array).
    * ``classes`` -- the list of samples assigned to each cluster during the
      most recent assignment pass.
    """

    def __init__(self, k=3, tolerance=0.0001, max_iterations=500):
        """Store the hyper-parameters; no computation happens until ``fit``.

        Args:
            k: Number of clusters to form.
            tolerance: Largest summed per-coordinate centroid movement, in
                percent of the previous position, that still counts as
                "did not move" when testing for convergence.
            max_iterations: Upper bound on assignment/update rounds.
        """
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations

    def _nearest_centroid(self, point):
        """Return the index of the centroid closest to ``point`` (first wins ties)."""
        distances = [
            np.linalg.norm(point - self.centroids[label])
            for label in self.centroids
        ]
        return distances.index(min(distances))

    @staticmethod
    def _percent_shift(previous, current):
        """Return the summed absolute per-coordinate movement in percent."""
        previous = np.asarray(previous, dtype=float)
        # Absolute values stop moves in opposite directions from cancelling
        # out (and a purely "negative" move from looking like convergence).
        # A coordinate that was exactly 0 has no meaningful relative change,
        # so it is measured on an absolute scale instead of dividing by zero.
        scale = np.where(previous == 0, 1.0, np.abs(previous))
        return float(np.sum(np.abs(current - previous) / scale * 100.0))

    def fit(self, data):
        """Cluster ``data`` and populate ``centroids`` and ``classes``.

        The first ``k`` samples are used as the initial centroids. Each round
        assigns every sample to its nearest centroid and then moves each
        centroid to the mean of its members. Iteration stops once no centroid
        moves by more than ``tolerance`` percent, or after ``max_iterations``
        rounds.

        Args:
            data: Array-like of samples, one sample per row.

        Raises:
            IndexError: If ``data`` holds fewer than ``k`` samples.
        """
        points = np.asarray(data, dtype=float)

        # Seed with the first k samples; copy so centroids never alias the input.
        self.centroids = {label: points[label].copy() for label in range(self.k)}

        for _ in range(self.max_iterations):
            # Assignment step: bucket every sample under its nearest centroid.
            self.classes = {label: [] for label in range(self.k)}
            for features in points:
                self.classes[self._nearest_centroid(features)].append(features)

            previous = dict(self.centroids)

            # Update step: move each centroid to the mean of its members.
            for label, members in self.classes.items():
                # An empty cluster has no mean; keep its old centroid rather
                # than replacing it with nan.
                if members:
                    self.centroids[label] = np.average(members, axis=0)

            # Converged when every centroid stayed within the tolerance.
            converged = all(
                self._percent_shift(previous[label], self.centroids[label])
                <= self.tolerance
                for label in self.centroids
            )
            if converged:
                break

    def pred(self, data):
        """Return the index of the fitted cluster whose centroid is nearest to ``data``.

        Args:
            data: A single sample (array-like) with the same number of
                features as the data passed to ``fit``.
        """
        return self._nearest_centroid(np.asarray(data, dtype=float))
def main(csv_path="data/ipl.csv"):
    """Cluster the 'one' and 'two' columns of a CSV file and plot the result.

    Fits a 3-cluster ``K_Means`` model, then draws each centroid as an "x"
    marker and each sample as a dot coloured by its cluster.

    Args:
        csv_path: Location of the CSV file. The default is relative to the
            current working directory and uses forward slashes so that it
            resolves on Windows and POSIX systems alike.
    """
    frame = pd.read_csv(csv_path)
    # Only these two columns are clustered; force floats before fitting.
    X = frame[['one', 'two']].astype(float).values

    km = K_Means(3)
    km.fit(X)

    # Plotting starts here
    colors = 10 * ["r", "g", "c", "b", "k"]

    for centroid in km.centroids.values():
        plt.scatter(centroid[0], centroid[1], s=130, marker="x")

    for classification, members in km.classes.items():
        if not members:
            # Nothing to draw for a cluster that received no samples.
            continue
        cluster = np.asarray(members)
        # One scatter call per cluster instead of one per sample.
        plt.scatter(cluster[:, 0], cluster[:, 1], color=colors[classification], s=30)

    plt.show()
# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()