-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
75 lines (64 loc) · 2.41 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Read the song table and keep only the feature columns used for clustering.
data = pd.read_csv("song_data.csv")
feats = data[
    [
        'danceability',
        'energy',
        'loudness',
        "acousticness",
        "instrumentalness",
        "tempo",
        "valence",
        "key",
    ]
].copy()  # independent copy: the scaling step below overwrites these columns

# How many clusters k-means should produce
k = 6
# Scale the features
def scale(column, frame=None):
    """Min-max rescale one column in place so its values span 0 to 10.

    Args:
        column: Label of the column to rescale.
        frame: DataFrame to modify. Defaults to the module-level ``feats``
            table, so existing ``scale(column)`` calls keep working.

    Returns:
        None. The column is overwritten inside ``frame``.

    A column whose values are all equal has a zero range; it is set to
    0.0 rather than dividing by zero and filling the column with NaN
    (NaN features would make every distance computed from them NaN).
    """
    if frame is None:
        frame = feats
    values = frame[column]
    lowest = values.min()
    value_range = values.max() - lowest
    if value_range == 0:
        frame[column] = 0.0
        return
    frame[column] = (values - lowest) / value_range * 10
# Rescale every feature column with scale() (each one ends up on a 0-10 range).
for column in list(feats.columns):
    scale(column)

# Seed the centroids with k rows drawn at random from the scaled data.
seed_rows = feats.sample(n=k)
centroids = seed_rows.values
print("Initial centroids:", centroids)
# Function to assign clusters
def assign_clusters(centroids, data=None):
    """Label every row with the index of its nearest centroid.

    Distances are Euclidean and computed for all rows at once with NumPy
    broadcasting instead of a Python loop over ``iterrows()``.

    Args:
        centroids: Array-like of shape (n_clusters, n_features).
        data: DataFrame or array-like of shape (n_rows, n_features).
            Defaults to the module-level ``feats`` table, so existing
            ``assign_clusters(centroids)`` calls keep working.

    Returns:
        1-D integer NumPy array with one centroid index per row. When a
        row is equally close to several centroids the lowest index wins.

    Raises:
        ValueError: If ``centroids`` is empty.
    """
    if data is None:
        data = feats
    points = np.asarray(data)
    centers = np.asarray(centroids)
    if len(centers) == 0:
        raise ValueError("centroids must contain at least one centroid")
    # Broadcast to (n_rows, n_clusters, n_features). This temporary grows with
    # rows * clusters * features, which is small for a handful of clusters.
    offsets = centers[np.newaxis, :, :] - points[:, np.newaxis, :]
    distances = np.sqrt(np.sum(np.square(offsets), axis=2))
    return np.argmin(distances, axis=1)
# Function to recalculate centroids
def recalculate_centroids(assignments, k, data=None):
    """Return one centroid per cluster: the mean of the rows assigned to it.

    Args:
        assignments: Sequence of cluster indices, one per row of the data.
        k: Number of clusters; indices ``0 .. k-1`` are processed in order.
        data: DataFrame holding the points. Defaults to the module-level
            ``feats`` table, so existing two-argument calls keep working.

    Returns:
        NumPy array with ``k`` rows, one centroid per cluster index. A
        cluster with no assigned rows is re-seeded with one row sampled
        at random from the data.
    """
    if data is None:
        data = feats
    # Convert once; the original rebuilt this array on every loop iteration.
    labels = np.asarray(assignments)
    new_centroids = []
    for cluster_index in range(k):
        members = data[labels == cluster_index]
        if members.empty:
            # If a cluster has no points, reinitialize to a random point
            new_centroids.append(data.sample(n=1).values[0])
        else:
            new_centroids.append(members.mean(axis=0).values)
    return np.array(new_centroids)
# Perform the k-means algorithm: alternate assignment and centroid update for
# at most num_iterations rounds.
num_iterations = 10
previous_assignments = None
for _ in range(num_iterations):
    assignments = assign_clusters(centroids)
    # Stop early at a fixed point. If the labels did not change, the centroids
    # computed from them are the ones already in use, so further rounds would
    # repeat the same result. The check requires all k clusters to be populated
    # because an empty cluster is re-seeded at random on every update and so
    # is never at a fixed point.
    if (
        previous_assignments is not None
        and np.array_equal(assignments, previous_assignments)
        and len(np.unique(assignments)) == k
    ):
        break
    centroids = recalculate_centroids(assignments, k)
    previous_assignments = assignments

# Adding cluster assignments to the dataframe
feats['cluster'] = assignments
# 3D scatter of three of the clustered features, coloured by cluster index.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    feats['danceability'],   # x axis
    feats['acousticness'],   # y axis
    feats['loudness'],       # z axis
    c=feats['cluster'],
    cmap='viridis',
)

# Axis titles, in the same x / y / z order as the scatter arguments.
ax.set_xlabel('Dance')
ax.set_ylabel('Acousticness')
ax.set_zlabel('Loudness')

plt.show()
# Count rows whose cluster index differs from the label expected for their
# position in the table: rows 0-99 -> 1, rows 100-199 -> 2, rows 200-299 -> 0.
# NOTE(review): the centroids are seeded from a random sample, so which index a
# group receives can differ between runs, and k may exceed the three labels
# checked here -- confirm this count is a meaningful error measure.
# NOTE(review): slicing means a table shorter than 300 rows is scored on the
# rows that exist instead of raising IndexError -- confirm that is acceptable.
expected_blocks = (
    (0, 100, 1),
    (100, 200, 2),
    (200, 300, 0),
)
error = 0
for start, stop, expected_label in expected_blocks:
    mismatches = assignments[start:stop] != expected_label
    error += int(np.count_nonzero(mismatches))
print(error)
print(assignments)