-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkmeans.py
155 lines (127 loc) · 5.3 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import numpy as np
import matplotlib.pyplot as plt
def euclDistance(vector1, vector2):
'''
计算平方欧几里德距离
:param vector1: 向量1
:param vector2: 向量2
:return: 两个向量之间的距离
'''
return np.sum(np.power(vector2 - vector1, 2)) # 对应分量求平方差后整体求和
def initCentroids(dataSet, k):
'''
用随机样本初始化质点
:param dataSet: 训练数据
:param k: 需要计算的分类数
:return: 随机初始化的k个质点
'''
numSamples, dim = dataSet.shape # 将训练数据看作矩阵,求出行数和列数
centroids = np.zeros((k, dim)) # 初始化一个 k*dim 的零矩阵
s = set() # python中的数据类型:集合
for i in range(k): # i 从 0 迭代到 k
while True:
index = int(np.random.uniform(0, numSamples)) # 将生成的随机数转换成整型
if index not in s: # 若集合 s 中不包含 index,则将其加入其中
s.add(index)
break
print("\trandom index: ", index)
centroids[i, :] = dataSet[index, :] # 将 dataSet 的第 index 行赋值给 centroids 的第 i 行
return np.mat(centroids)
def getCost(clusterAssment):
'''
获取cost,也就是误差平方和
:param clusterAssment: 用于存储聚类结果的矩阵
:return: cost
'''
len = clusterAssment.shape[0] # 获取聚类结果的行数,也就是样本的个数
Sum = 0.0
for i in range(len):
Sum = Sum + clusterAssment[i, 1]
return Sum
def kMeans(dataSet, k):
'''
k-means算法
:param dataSet: 训练数据
:param k: 类别个数
:return: 质点和聚类结果
'''
numSamples = dataSet.shape[0]
clusterAssment = np.mat(np.zeros((numSamples, 2))) # 第一列存这个样本点属于哪个簇,第二列存这个样本点与距离最近的质点之间的距离
for i in range(numSamples): # 初始化聚类结果矩阵
clusterAssment[i, 0] = -1
clusterChanged = True
cost = []
# step 1: 初始化 centroids 矩阵,随机获取k个质点
centroids = initCentroids(dataSet, k)
while clusterChanged: # 如果 kmeans 算法已经收敛,即质点不再变化,则 clusterChanged 的值为 False
clusterChanged = False
for i in range(numSamples): # 对于每个样本点
minDist = 10 ** 100 # 用于记录距离各个质点的最小距离
minIndex = -1 # 用于记录质点标号
# step 2: 找到距离最近的质点
for j in range(k): # 对于每个质点
distance = euclDistance(centroids[j], dataSet[i])
if distance < minDist:
minDist = distance
minIndex = j
# step 3: 更新样本点与质点的分配关系
if clusterAssment[i, 0] != minIndex: # kmeans 算法尚未收敛
clusterChanged = True
clusterAssment[i, :] = minIndex, minDist
else: # 这个样本点的质点不变,只更新最短距离
clusterAssment[i, 1] = minDist
# step 4: 更新质点
for j in range(k):
pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]
centroids[j, :] = np.mean(pointsInCluster, axis=0)
# step 5: 获取 cost 并存储,用于可视化
cost.append(getCost(clusterAssment))
print("\tthe times of iteration: ", len(cost))
print("\tcost: ", cost)
return centroids, clusterAssment
def showCluster(dataSet, k, centroids, clusterAssment):
'''
以2D形式可视化数据
:param dataSet: 训练数据
:param k: 类别个数
:param centroids: 样本质点
:param clusterAssment: 聚类结果
:return: void
'''
numSamples, dim = dataSet.shape
if dim != 2:
print("\tSorry! I can not draw because the dimension of your data is not 2!")
return 1
mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
if k > len(mark):
print("\tSorry! Your k is too large!")
return 1
# 绘制所有非中心样本点
for i in range(numSamples):
markIndex = int(clusterAssment[i, 0])
plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex - 1])
mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
# 绘制质点
for i in range(1, k + 1):
plt.plot(centroids[i, 0], centroids[i, 1], mark[i - 1], markersize=8)
plt.show()
# step 1: 载入数据
print("step 1: load data...")
dataSet = [] # 用于存储经过处理后的数据,用作训练 kmeans 算法
allDataSet = [] # 用于存储所有原始数据,元素为类型
with open('./wpbc.data.txt') as fileIn:
for line in fileIn.readlines():
line = line.strip().split(",")
allDataSet.append(line)
line = list(map(lambda x: float(x), line[2:])) # 将 list 中字符串元素变为 float 类型
dataSet.append(line)
dataSet = np.mat(dataSet) # 把 list 变换为 matrix
print("dataSet: \n", dataSet, "\n")
# step 2: 开始聚合...
print("step 2: clustering...")
k = 2 # 类别个数
centroids, clusterAssment = kMeans(dataSet, k)
# step 3: 显示结果
print("\nStep 3: show the outcome...")
print("centroids: \n", centroids)
print("\nclusterAssment: \n", clusterAssment)