-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclustering_analysis_experiments.py
104 lines (88 loc) · 4.85 KB
/
clustering_analysis_experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#########################################################
## VISIBILIA
## Project: SIION
## Year: 2018-2019
## Members:
## Vinicius Ferreira @viniciusferreiradasilva
## Alan Valejo @alanvalejo
## Paola Valdivia @paolavaldivia
## Jorge Valverde-Rebaza @jvalverr
##
## Site: http://visibilia.net.br/siion-resources-bracis-2019/
##
#########################################################
from __future__ import print_function

import argparse
import os
from collections import Counter

import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

from algorithms.clustering import agglomerative_clustering
from algorithms.clustering import dbscan
from algorithms.clustering import highest_attribute_value
from algorithms.clustering import kmeans
from preprocessing.file_loading import json_2_dataframe
from preprocessing.file_loading import load_attribute
from utils.utils import str2bool
# Instantiate the command-line parser. RawTextHelpFormatter keeps the explicit
# line breaks written in the help strings below.
parser = argparse.ArgumentParser(description='Map segmentation tool.', formatter_class=argparse.RawTextHelpFormatter)
# Required input file argument.
parser.add_argument('--input_file', required=True, type=str,
                    help='A string representing a .json input file path with latitude and longitude columns.')
# Required output dir argument: the analysis .csv file is written there.
parser.add_argument('--output_dir', required=True, type=str,
                    help='A string representing the output dir path where the clustering analysis .csv file '
                         'will be saved.')
# Optional map-plotting flag. It used to be declared required while also
# carrying a default, which made the default unreachable; it is optional now so
# that the default applies. Passing --map explicitly keeps working as before.
parser.add_argument('--map', required=False, type=str, default='false',
                    help='A boolean-like string (default: "false") indicating whether the map will be plotted.')
# Required clustering algorithm argument. The value is used directly as an
# index into the list of implemented algorithms, hence the 0-based numbering.
parser.add_argument('--clustering_algorithm', type=int, required=True,
                    help='An integer representing the clustering algorithm that will be used to segment the map:\n'
                         '0 - Highest attribute. Args: attribute_name (str), threshold (float),\n'
                         '1 - K-means. Args: n_clusters (int),\n'
                         '2 - DBSCAN. Args eps (float), min_samples (int),\n'
                         '3 - Agglomerative clustering. Args: n_clusters (int)\n')
# Optional clustering algorithm configs (defaults to an empty list, in which
# case no clustering is evaluated).
parser.add_argument('--configs', required=False, nargs='+', default=[],
                    help='List of configs that will be passed to the clustering algorithm, one run per config.\n'
                         'Each clustering algorithm has a certain number of parameters. Pass the parameters of\n'
                         'one config in the expected order as a single quoted, space-separated string:\n'
                         '--configs "CONF_1" "CONF_2" ... "CONF_N"')
args = parser.parse_args()


def _parse_config_value(token):
    """Convert one config token to a float when it is a finite number.

    The previous check relied on ``str.isdigit()``, which rejects decimal and
    negative values such as ``'0.5'`` or ``'-1'``; those were therefore passed
    to the clustering algorithm as strings.

    :param token: a single whitespace-free piece of a ``--configs`` entry.
    :return: ``float(token)`` when the token is a finite number, otherwise the
        token itself, unchanged (e.g. an attribute name).
    """
    try:
        number = float(token)
    except ValueError:
        return token
    # float() also accepts words such as 'nan' or 'inf'; keep those as plain
    # strings so that a textual parameter is never turned into a number.
    return number if np.isfinite(number) else token


# Retrieve the configs from arguments: every --configs entry is split on single
# spaces and each resulting token is converted on its own.
configs = [[_parse_config_value(token) for token in raw_config.strip().split(' ')]
           for raw_config in args.configs]
# Every implemented clustering algorithm; --clustering_algorithm is used as an
# index into this list.
clustering_algorithms = [highest_attribute_value, kmeans, dbscan, agglomerative_clustering]
# Read the input .json file into a dataframe and attach the 'checkins'
# attribute to it.
# NOTE(review): presumably load_attribute matches rows on 'business_id' --
# confirm against preprocessing.file_loading.
print("loading data {}".format(args.input_file))
df = load_attribute(json_2_dataframe(args.input_file), 'data/input/filtered/checkins.json', 'business_id',
                    'checkins')
# Coordinates matrix: one row per dataframe entry, columns latitude and
# longitude.
coordinate_columns = ['latitude', 'longitude']
X = df[coordinate_columns].values
# Whether the map should be plotted (not read again in this script).
plot_map = str2bool(args.map)
# Per-config results: average silhouette and number of clusters. The buffers
# are left uninitialised here and filled one slot per config afterwards.
results_shape = (len(configs),)
silhouettes = np.empty(results_shape)
ns_clusters = np.empty(results_shape)
# Run the selected clustering algorithm once per config and record, for each
# run, the number of clusters found and the average silhouette.
for index, config in enumerate(configs):
    print('config:', config)
    # Clustering: the algorithm receives the dataframe plus the config values
    # as positional arguments; only the 'cluster_id' column of its result is
    # used here.
    clustering = clustering_algorithms[args.clustering_algorithm](df, *config)
    cluster_labels = np.array(clustering['cluster_id'])
    n_clusters = len(Counter(cluster_labels))
    ns_clusters[index] = n_clusters
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters. It is only defined for 2 <= n_clusters <= n_samples - 1 and
    # raises ValueError otherwise, so a degenerate config is recorded as nan
    # instead of aborting the whole sweep before any result is saved.
    if 1 < n_clusters < len(cluster_labels):
        silhouettes[index] = silhouette_score(X, cluster_labels)
    else:
        print('silhouette is undefined for', n_clusters, 'cluster(s); storing nan')
        silhouettes[index] = np.nan
# Build the output path as <output_dir>/<input file stem>_<algorithm name>.csv.
# os.path.join inserts the separator when --output_dir has no trailing slash;
# plain concatenation used to glue the directory name onto the file name.
input_name = os.path.basename(args.input_file).split('.')[0]
algorithm_name = clustering_algorithms[args.clustering_algorithm].__name__
clustering_file = os.path.join(args.output_dir, input_name + '_' + algorithm_name + '.csv')
print("saving clustering analysis into:", clustering_file)
# Writing one CSV row per config: the config values, then the average
# silhouette, then the number of clusters. The with-statement closes the file.
with open(clustering_file, 'w') as f:
    for config, silhouette, n_clusters in zip(configs, silhouettes, ns_clusters):
        f.write(','.join(map(str, config)) + ',' + str(silhouette) + ',' + str(n_clusters) + '\n')