Skip to content

Commit

Permalink
[Fix] SCHC clustering with ward linkage discrepancy
Browse files Browse the repository at this point in the history
Signed-off-by: Xun Li <[email protected]>
  • Loading branch information
lixun910 committed Jan 24, 2025
1 parent c175643 commit 66dbcfe
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 19 deletions.
1 change: 1 addition & 0 deletions clustering/redcap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ void Tree::Split(int orig, int dest, boost::unordered_map<int, std::vector<int>
bool Tree::checkControl(std::vector<int>& cand_ids, std::vector<int>& ids, int flag)
{
if (controls == NULL) {
std::cout << "controls is NULL" << std::endl;
return true;
}

Expand Down
13 changes: 9 additions & 4 deletions clustering/redcap_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ redcap_wrapper::redcap_wrapper(unsigned int k,
double *_bound_vals = 0;
if ((int)bound_vals.size() == num_obs) {
_bound_vals = new double[num_obs];
for (int i = 0; i < num_obs; ++i) _bound_vals[i] = bound_vals[i];
for (int i = 0; i < num_obs; ++i) {
_bound_vals[i] = bound_vals[i];
}
}
// get distance matrix
int n_cols = data.size();
Expand All @@ -56,9 +58,12 @@ redcap_wrapper::redcap_wrapper(unsigned int k,
for (int i=0; i<n_cols; ++i) weight[i] = 1.0;

// lower triangle distance matrix
double** distances = dist_matrix;
if (!distances) distances = distancematrix(num_obs, n_cols, matrix, mask, weight, dist, transpose);
//double** distances = DataUtils::fullRaggedMatrix(ragged_distances, num_obs, num_obs);
double** ragged_distances = dist_matrix;
if (!ragged_distances) ragged_distances = distancematrix(num_obs, n_cols, matrix, mask, weight, dist, transpose);

// convert ragged distance matrix to full distance matrix
bool isSqrt = redcap_method == 2 ? true : false; // sqrt for average linkage
double** distances = DataUtils::fullRaggedMatrix(ragged_distances, num_obs, num_obs, isSqrt);

// call redcap
std::vector<bool> undefs(num_obs, false); // not used
Expand Down
23 changes: 16 additions & 7 deletions clustering/schc_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,16 @@ schc_wrapper::schc_wrapper(unsigned int k,
for (int i=0; i<n_cols; ++i) weight[i] = 1.0;

// lower triangular half of the distance matrix
double** distances = dist_matrix;
if (!distances) distances = distancematrix(num_obs, n_cols, matrix, mask, weight, dist, transpose);
//double** distances = DataUtils::fullRaggedMatrix(ragged_distances, num_obs, num_obs);
double** ragged_distances = dist_matrix;
if (!ragged_distances) ragged_distances = distancematrix(num_obs, n_cols, matrix, mask, weight, dist, transpose);

// convert ragged distance matrix to full distance matrix
bool isSqrt = method == 2 ? true : false; // sqrt for average linkage
double** distances = DataUtils::fullRaggedMatrix(ragged_distances, num_obs, num_obs, isSqrt);

// free ragged distance matrix
for (int i = 1; i < num_obs; i++) free(ragged_distances[i]);
free(ragged_distances);

// call redcap
std::vector<bool> undefs(num_obs, false); // not used
Expand Down Expand Up @@ -109,24 +116,26 @@ schc_wrapper::schc_wrapper(unsigned int k,
}
}

std::vector<int> clusters;
int* clusterid = new int[num_obs];

cuttree (num_obs, htree, k, clusterid);

delete[] htree;
double cut_distance = cuttree (num_obs, htree, k, clusterid);

std::vector<int> clusters;
for (int i=0; i<num_obs; i++) {
clusters.push_back(clusterid[i]+1);
}
delete[] clusterid;
clusterid = NULL;
delete[] htree;
htree = NULL;

// sort result
cluster_ids.resize(k);
for (int i=0; i < clusters.size(); i++) {
cluster_ids[ clusters[i] - 1 ].push_back(i);
}
std::sort(cluster_ids.begin(), cluster_ids.end(),
GenUtils::less_vectors);
}

if (weight) delete[] weight;
Expand Down
22 changes: 14 additions & 8 deletions gda_clustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,9 @@ double gda_totalsumofsquare(const std::vector<std::vector<double> >& vals)
std::vector<double> gda_withinsumofsquare(const std::vector<std::vector<int> >& solution,
const std::vector<std::vector<double> >& _data)
{
// solution is a list of lists of region ids [[1,7,2],[0,4,3],...] such
// that the first solution has areas 1,7,2 the second solution 0,4,3 and so
// on. cluster_ids does not have to be exhaustive
size_t cols = _data.size();

// standardize data
Expand All @@ -329,21 +332,24 @@ std::vector<double> gda_withinsumofsquare(const std::vector<std::vector<int> >&
}

std::vector<double> within_ss;
for (size_t c=0; c<cols; ++c) {
double ss = 0;
for (size_t i=0; i<solution.size(); ++i) {

// compute within sum of squares for each cluster
for (size_t i = 0; i < solution.size(); ++i) {
const std::vector<int>& ids = solution[i];
double ssq = 0;
for (size_t c = 0; c < cols; ++c) {
std::vector<double> vals;
for (size_t j = 0; j < solution[i].size(); ++j) {
size_t r = solution[i][j];
for (size_t j = 0; j < ids.size(); ++j) {
size_t r = ids[j];
vals.push_back(data[c][r]);
}
ss += gda_sumofsquares(vals);
ssq += gda_sumofsquares(vals);
}
within_ss.push_back(ss);
within_ss.push_back(ssq);
}

return within_ss;
}
}

double gda_totalwithinsumofsquare(const std::vector<std::vector<int> >& solution,
const std::vector<std::vector<double> >& _data)
Expand Down

0 comments on commit 66dbcfe

Please sign in to comment.