-
Notifications
You must be signed in to change notification settings - Fork 16
/
fastcluster.cpp
169 lines (151 loc) · 4.78 KB
/
fastcluster.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
//
// C++ standalone verion of fastcluster by Daniel Müllner
//
// Copyright: Christoph Dalitz, 2020
// Daniel Müllner, 2011
// License: BSD style license
// (see the file LICENSE for details)
//
#include <vector>
#include <algorithm>
#include "fastcluster.h"
// Code by Daniel Müllner
// workaround to make it usable as a standalone version (without R)
bool fc_isnan(double x) { return x != x; }
#include "fastcluster_dm.cpp"
#include "fastcluster_R_dm.cpp"
//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the cluster result is split into nclust clusters.
//
// Input arguments:
// n = number of observables
// merge = clustering result in R format
// nclust = number of clusters
// Output arguments:
// labels = allocated integer array of size n for result
//
void cutree_k(int n, const int* merge, int nclust, int* labels) {
int k,m1,m2,j,l;
if (nclust > n || nclust < 2) {
for (j=0; j<n; j++) labels[j] = 0;
return;
}
// assign to each observable the number of its last merge step
// beware: indices of observables in merge start at 1 (R convention)
std::vector<int> last_merge(n, 0);
for (k=1; k<=(n-nclust); k++) {
// (m1,m2) = merge[k,]
m1 = merge[k-1];
m2 = merge[n-1+k-1];
if (m1 < 0 && m2 < 0) { // both single observables
last_merge[-m1-1] = last_merge[-m2-1] = k;
}
else if (m1 < 0 || m2 < 0) { // one is a cluster
if(m1 < 0) { j = -m1; m1 = m2; } else j = -m2;
// merging single observable and cluster
for(l = 0; l < n; l++)
if (last_merge[l] == m1)
last_merge[l] = k;
last_merge[j-1] = k;
}
else { // both cluster
for(l=0; l < n; l++) {
if( last_merge[l] == m1 || last_merge[l] == m2 )
last_merge[l] = k;
}
}
}
// assign cluster labels
int label = 0;
std::vector<int> z(n,-1);
for (j=0; j<n; j++) {
if (last_merge[j] == 0) { // still singleton
labels[j] = label++;
} else {
if (z[last_merge[j]] < 0) {
z[last_merge[j]] = label++;
}
labels[j] = z[last_merge[j]];
}
}
}
//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the hierarchical clustering is stopped when cluster distance >= cdist
//
// Input arguments:
// n = number of observables
// merge = clustering result in R format
// height = cluster distance at each merge step
// cdist = cutoff cluster distance
// Output arguments:
// labels = allocated integer array of size n for result
//
void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels) {
int k;
for (k=0; k<(n-1); k++) {
if (height[k] >= cdist) {
break;
}
}
cutree_k(n, merge, n-k, labels);
}
//
// Hierarchical clustering with one of Daniel Muellner's fast algorithms
//
// Input arguments:
// n = number of observables
// distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing
// the upper triangle (without diagonal elements) of the distance
// matrix, e.g. for n=4:
// d00 d01 d02 d03
// d10 d11 d12 d13 -> d01 d02 d03 d12 d13 d23
// d20 d21 d22 d23
// d30 d31 d32 d33
// method = cluster metric (see enum hclust_fast_methods)
// Output arguments:
// merge = allocated (n-1)x2 matrix (2*(n-1) array) for storing result.
// Result follows R hclust convention:
// - observabe indices start with one
// - merge[i][] contains the merged nodes in step i
// - merge[i][j] is negative when the node is an atom
// height = allocated (n-1) array with distances at each merge step
// Return code:
// 0 = ok
// 1 = invalid method
//
int hclust_fast(int n, double* distmat, int method, int* merge, double* height) {
// call appropriate culstering function
cluster_result Z2(n-1);
if (method == HCLUST_METHOD_SINGLE) {
// single link
MST_linkage_core(n, distmat, Z2);
}
else if (method == HCLUST_METHOD_COMPLETE) {
// complete link
NN_chain_core<METHOD_METR_COMPLETE, t_float>(n, distmat, NULL, Z2);
}
else if (method == HCLUST_METHOD_AVERAGE) {
// best average distance
double* members = new double[n];
for (int i=0; i<n; i++) members[i] = 1;
NN_chain_core<METHOD_METR_AVERAGE, t_float>(n, distmat, members, Z2);
delete[] members;
}
else if (method == HCLUST_METHOD_MEDIAN) {
// best median distance (beware: O(n^3))
generic_linkage<METHOD_METR_MEDIAN, t_float>(n, distmat, NULL, Z2);
}
else {
return 1;
}
int* order = new int[n];
if (method == HCLUST_METHOD_MEDIAN) {
generate_R_dendrogram<true>(merge, height, order, Z2, n);
} else {
generate_R_dendrogram<false>(merge, height, order, Z2, n);
}
delete[] order; // only needed for visualization
return 0;
}