-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster2.py
53 lines (43 loc) · 2.11 KB
/
cluster2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Assumes a DataFrame `df` already exists, with a `genre` column and one
# column per name in FEATURES; `df` is not created in the code shown here.
# Candidate feature columns: every non-empty combination of these names is
# clustered and scored further down.
FEATURES = ["danceability", "energy", "acousticness", "loudness"]
# Function to apply k-means clustering and evaluate results
def test_clustering(df, features, n_clusters):
    """Cluster ``df[features]`` with k-means, print its silhouette score, and plot it.

    Args:
        df: DataFrame-like object; ``df[features].values`` is the matrix
            that gets clustered.
        features: List of column names to cluster on. A single column is
            allowed.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        A ``(cluster_labels, silhouette_avg)`` tuple: the cluster label
        assigned to each row and the silhouette score of that labelling.
    """
    X = df[features].values

    # Apply K-means clustering (fixed seed so repeated runs are comparable)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X)

    # Calculate silhouette score to evaluate clustering quality
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(f"Silhouette score for features {features}: {silhouette_avg}")

    # Visualize clusters using PCA to reduce to (at most) 2 dimensions.
    # PCA cannot produce more components than there are input columns, so a
    # single-feature call must be projected to 1 component instead of 2;
    # asking for 2 in that case raises ValueError.
    n_components = min(2, X.shape[1])
    X_pca = PCA(n_components=n_components).fit_transform(X)
    x_coords = X_pca[:, 0]
    if n_components == 2:
        y_coords = X_pca[:, 1]
    else:
        # Only one component exists: draw the points along a flat baseline.
        y_coords = np.zeros(X_pca.shape[0])

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_coords, y=y_coords, hue=cluster_labels, palette="viridis", s=60)
    plt.title(f"K-means Clustering with features {features}")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend(title="Cluster")
    plt.show()

    # Return clustering labels and silhouette score for further analysis
    return cluster_labels, silhouette_avg
# Test clustering with different feature combinations.
# One cluster is requested per distinct value in the `genre` column.
n_clusters = len(df['genre'].unique())  # Number of genres as number of clusters

# Maps each feature combination (a tuple of column names) to its silhouette score.
results = {}
for r in range(1, len(FEATURES) + 1):
    # All subsets of FEATURES of size r, in the order FEATURES lists them.
    for feature_combination in itertools.combinations(FEATURES, r):
        # Run clustering and save results
        labels, silhouette_avg = test_clustering(df, list(feature_combination), n_clusters)
        results[feature_combination] = silhouette_avg

# Print the best feature combination for clustering
# (the key whose stored silhouette score is largest).
best_features = max(results, key=results.get)
print(f"Best feature combination for clustering: {best_features}")
print(f"Highest silhouette score: {results[best_features]}")