From 57098a565bb2e111cddcf0a617b5abeacf75d53f Mon Sep 17 00:00:00 2001 From: Hakdag97 <72792786+Hakdag97@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:00:21 +0100 Subject: [PATCH 1/3] Created a test file mytest.py --- heat/cluster/mytest.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 heat/cluster/mytest.py diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py new file mode 100644 index 000000000..30e40b186 --- /dev/null +++ b/heat/cluster/mytest.py @@ -0,0 +1,4 @@ +import heat as ht + +ht.use_device('gpu') +ht.zeros((3, 4,)) From 5b7a6e0409d410cab4abe0f07970560c1ae3604f Mon Sep 17 00:00:00 2001 From: Akdag Date: Mon, 16 Dec 2024 16:34:58 +0100 Subject: [PATCH 2/3] Implementation of parallel initialization --- heat/cluster/_kcluster.py | 158 +++++++++++++++------ heat/cluster/batchparallelclustering.py | 21 ++- heat/cluster/kmeans.py | 9 +- heat/cluster/kmedians.py | 11 +- heat/cluster/kmedoids.py | 9 +- heat/cluster/mytest.py | 180 +++++++++++++++++++++++- heat/cluster/tests/test_kmedoids.py | 5 +- heat/core/indexing.py | 3 +- 8 files changed, 333 insertions(+), 63 deletions(-) diff --git a/heat/cluster/_kcluster.py b/heat/cluster/_kcluster.py index c9505abf1..6029cc721 100644 --- a/heat/cluster/_kcluster.py +++ b/heat/cluster/_kcluster.py @@ -3,6 +3,8 @@ """ import heat as ht +import torch +from heat.cluster.batchparallelclustering import _kmex from typing import Optional, Union, Callable from heat.core.dndarray import DNDarray @@ -94,7 +96,9 @@ def functional_value_(self) -> DNDarray: """ return self._functional_value - def _initialize_cluster_centers(self, x: DNDarray): + def _initialize_cluster_centers( + self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20 + ): """ Initializes the K-Means centroids. @@ -102,6 +106,12 @@ def _initialize_cluster_centers(self, x: DNDarray): ---------- x : DNDarray The data to initialize the clusters for. 
Shape = (n_samples, n_features) + + oversampling : float + oversampling factor used in the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # always initialize the random state if self.random_state is not None: @@ -123,53 +133,113 @@ def _initialize_cluster_centers(self, x: DNDarray): raise ValueError("passed centroids do not match cluster count or data shape") self._cluster_centers = self.init.resplit(None) - # Smart centroid guessing, random sampling with probability weight proportional to distance to existing centroids + # Parallelized centroid guessing using the k-means|| algorithm elif self.init == "probability_based": + # First, check along which axis the data is sliced if x.split is None or x.split == 0: - centroids = ht.zeros( - (self.n_clusters, x.shape[1]), split=None, device=x.device, comm=x.comm - ) - sample = ht.random.randint(0, x.shape[0] - 1).item() - _, displ, _ = x.comm.counts_displs_shape(shape=x.shape, axis=0) - proc = 0 - for p in range(x.comm.size): - if displ[p] > sample: - break - proc = p - x0 = ht.zeros(x.shape[1], dtype=x.dtype, device=x.device, comm=x.comm) - if x.comm.rank == proc: - idx = sample - displ[proc] - x0 = ht.array(x.lloc[idx, :], device=x.device, comm=x.comm) - x0.comm.Bcast(x0, root=proc) - centroids[0, :] = x0 - for i in range(1, self.n_clusters): - distances = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) - D2 = distances.min(axis=1) - D2.resplit_(axis=None) - prob = D2 / D2.sum() - random_position = ht.random.rand() - sample = 0 - sum = 0 - for j in range(len(prob)): - if sum > random_position: - break - sum += prob[j].item() - sample = j - proc = 0 - for p in range(x.comm.size): - if displ[p] > sample: - break - proc = p - xi = ht.zeros(x.shape[1], dtype=x.dtype) - if x.comm.rank == proc: - idx = sample - displ[proc] - xi = ht.array(x.lloc[idx, :], device=x.device, comm=x.comm) - xi.comm.Bcast(xi, root=proc) - centroids[i, :] = xi - + # Define a list of random, uniformly distributed probabilities, which is later used to sample the centroids + sample = ht.random.rand(x.shape[0], split=x.split) + # Define a random integer serving as a label to pick the first centroid randomly + init_idx = ht.random.randint(0, x.shape[0] - 1).item() + # Randomly select first centroid and organize it as a tensor, in order to use the function cdist later.
+ # This tensor will be filled continously in the proceeding of this function + # We assume that the centroids fit into the memory of a single GPU + centroids = ht.expand_dims(x[init_idx, :].resplit_(None), axis=0) + # Calculate the initial cost of the clustering after the first centroid selection + # and use it as an indicator for the number of necessary iterations + # --> First calculate the Euclidean distance between data points x and initial centroids + # output format: tensor + init_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # --> Pick the minimal distance of the data points to each centroid + # output format: vector + init_min_distance = init_distance.min(axis=1) + # --> Now calculate the cost + # output format: scalar + init_cost = init_min_distance.sum() + # Iteratively fill the tensor storing the centroids + for _ in ht.arange(0, iter_multiplier * ht.log(init_cost)): + # Calculate the distance between data points and the current set of centroids + distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + min_distance = distance.min(axis=1) + # Sample each point in the data to a new set of centroids + # --> probability distribution with oversampling factor + # output format: vector + prob = oversampling * min_distance / min_distance.sum() + # --> choose indices to sample the data according to prob + # output format: vector + idx = ht.where(sample <= prob) + # --> stack the data points with these indices to the DNDarray of centroids + # output format: tensor + """print(f"idx={idx}") + if idx.shape[0]!=0: + print(f"idx={idx}, idx.shape={idx.shape}, x[idx]={x[idx]}") + local_data= x[idx].resplit_(centroids.split) # make sure, that the data points we append to centroids are split in the same way + centroids=ht.row_stack((centroids,local_data)) """ + # print(f"x[idx]={x[idx]}, x[idx].shape={x[idx].shape}, process= {ht.MPI_WORLD.rank}\n") + # print(f"centroids.split={centroids.split}, process= {ht.MPI_WORLD.rank}\n") + # if idx.shape[0]!=0: + local_data = x[idx].resplit_( + centroids.split + ) # make sure, that the data points we append to centroids are split in the same way + # local_data=x[idx] + # print(f"x[1]={x[1]}, local_data={local_data}, process= {ht.MPI_WORLD.rank}\n") + centroids = ht.row_stack((centroids, local_data)) + # Evaluate distance between final centroids and data points + if centroids.shape[0] <= self.n_clusters: + raise ValueError( + "The oversampling factor and/or the number of iterations are chosen two small for the initialization of cluster centers." 
) + final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # For each data point in x, find the index of the centroid that is closest + final_idx = ht.argmin(final_distance, axis=1) + # Introduce weights, i.e., the number of data points closest to each centroid + # (count how often the same index in final_idx occurs) + weights = ht.zeros(centroids.shape[0], split=centroids.split) + for i in range(centroids.shape[0]): + weights[i] = ht.sum(final_idx == i) + # Recluster the oversampled centroids using standard k-means ++ (here we use the + # already implemented version in torch) + # --> first transform relevant arrays into torch tensors + centroids = centroids.resplit_(None) + centroids = centroids.larray + weights = weights.resplit_(None) + weights = weights.larray + # --> apply k-means ++ + if ht.MPI_WORLD.rank == 0: + batch_kmeans = _kmex( + centroids, + p=2, + n_clusters=self.n_clusters, + init="++", + max_iter=self.max_iter, + tol=self.tol, + random_state=None, + weights=weights, + ) + reclustered_centroids = batch_kmeans[0] # access the reclustered centroids + else: + # ensure that all processes have the same data + # tensor with zeros that has the same size as reclustered centroids, in order to to allocate memory with the correct type (necessary for broadcast) + reclustered_centroids = torch.zeros( + (self.n_clusters, centroids.shape[1]), + dtype=x.dtype.torch_type(), + device=centroids.device, + ) + ht.MPI_WORLD.Bcast( + reclustered_centroids, root=0 + ) # by default it is broadcasted from process 0 + # ------------------------------------------------------------------------------- + # print(f"reclustered centroids in initilialize_cluster_centers (after applying kmex)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") + # ------------------------------------------------------------------------------- + # --> transform back to DNDarray + reclustered_centroids = ht.array(reclustered_centroids, split=x.split) + # final result + self._cluster_centers = reclustered_centroids + # ------------------------------------------------------------------------------- + # print(f"reclustered centroids in initilialize_cluster_centers (final result)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") + # ------------------------------------------------------------------------------- else: raise NotImplementedError("Not implemented for other splitting-axes") - self._cluster_centers = centroids elif self.init == "batchparallel": if x.split == 0: diff --git a/heat/cluster/batchparallelclustering.py b/heat/cluster/batchparallelclustering.py index 257b88c18..d6d756ef5 100644 --- a/heat/cluster/batchparallelclustering.py +++ b/heat/cluster/batchparallelclustering.py @@ -4,7 +4,8 @@ import heat as ht import torch -from heat.cluster._kcluster import _KCluster + +# from heat.cluster._kcluster import _KCluster from heat.core.dndarray import DNDarray from warnings import warn from math import log @@ -19,10 +20,14 @@ """ -def _initialize_plus_plus(X, n_clusters, p, random_state=None, max_samples=2**24 - 1): +def _initialize_plus_plus( + X, n_clusters, p, random_state=None, weights: torch.tensor = 1, max_samples=2**24 - 1 +): """ Auxiliary function: single-process k-means++/k-medians++ initialization in pytorch p is the norm used for computing distances + weights allows adding weights to the distribution function, so that the data points with higher weights are preferred; + note that weights must be a 1-D tensor with one entry per data point, i.e., of length X.shape[0] The value max_samples=2**24 - 1 is
necessary as PyTorchs multinomial currently only supports this number of different categories. """ @@ -37,11 +42,11 @@ def _initialize_plus_plus(X, n_clusters, p, random_state=None, max_samples=2**24 for i in range(1, n_clusters): dist = torch.cdist(X, X[idxs[:i]], p=p) dist = torch.min(dist, dim=1)[0] - idxs[i] = torch.multinomial(dist, 1) + idxs[i] = torch.multinomial(weights * dist, 1) return X[idxs] -def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None): +def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None, weights: torch.tensor = 1.0): """ Auxiliary function: single-process k-means and k-medians in pytorch p is the norm used for computing distances: p=2 implies k-means, p=1 implies k-medians. @@ -55,7 +60,7 @@ def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None): raise ValueError("if a torch tensor, init must have shape (n_clusters, n_features).") centers = init elif init == "++": - centers = _initialize_plus_plus(X, n_clusters, p, random_state) + centers = _initialize_plus_plus(X, n_clusters, p, random_state, weights) elif init == "random": idxs = torch.randint(0, X.shape[0], (n_clusters,)) centers = X[idxs] @@ -169,7 +174,7 @@ def functional_value_(self) -> float: """ return self._functional_value - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, weights: torch.tensor = 1): """ Computes the centroid of the clustering algorithm to fit the data ``x``. @@ -178,6 +183,8 @@ def fit(self, x: DNDarray): x : DNDarray Training instances to cluster. Shape = (n_samples, n_features). It must hold x.split=0. + weights: torch.tensor + Add weights to the distribution function used in the clustering algorithm in kmex """ if not isinstance(x, DNDarray): raise TypeError(f"input needs to be a ht.DNDarray, but was {type(x)}") @@ -198,6 +205,7 @@ def fit(self, x: DNDarray): self.max_iter, self.tol, local_random_state, + weights, ) # hierarchical approach to obtail "global" cluster centers from the "local" centers @@ -233,6 +241,7 @@ def fit(self, x: DNDarray): self.max_iter, self.tol, local_random_state, + weights, ) del gathered_centers_local n_iters_local += n_iters_local_new diff --git a/heat/cluster/kmeans.py b/heat/cluster/kmeans.py index 96067aa82..9a247bc42 100644 --- a/heat/cluster/kmeans.py +++ b/heat/cluster/kmeans.py @@ -102,7 +102,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray) -> self: + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20) -> self: """ Computes the centroid of a k-means clustering. @@ -111,13 +111,18 @@ def fit(self, x: DNDarray) -> self: x : DNDarray Training instances to cluster. 
Shape = (n_samples, n_features) + oversampling : float + oversampling factor used for the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/kmedians.py b/heat/cluster/kmedians.py index c7d991b1f..0bd2cbb66 100644 --- a/heat/cluster/kmedians.py +++ b/heat/cluster/kmedians.py @@ -65,6 +65,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): ---------- x : DNDarray Input data + matching_centroids : DNDarray Array filled with indeces ``i`` indicating to which cluster ``ci`` each sample point in x is assigned @@ -103,7 +104,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20): """ Computes the centroid of a k-medians clustering. @@ -111,13 +112,19 @@ def fit(self, x: DNDarray): ---------- x : DNDarray Training instances to cluster. Shape = (n_samples, n_features) + + oversampling : float + oversampling factor used in the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, ht.DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/kmedoids.py b/heat/cluster/kmedoids.py index 0eb38a5eb..ec20dd24f 100644 --- a/heat/cluster/kmedoids.py +++ b/heat/cluster/kmedoids.py @@ -114,7 +114,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20): """ Computes the centroid of a k-medoids clustering. @@ -122,13 +122,18 @@ def fit(self, x: DNDarray): ---------- x : DNDarray Training instances to cluster.
Shape = (n_samples, n_features) + oversampling : float + oversampling factor used in the k-means|| initializiation of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py index 30e40b186..6a5783fa0 100644 --- a/heat/cluster/mytest.py +++ b/heat/cluster/mytest.py @@ -1,4 +1,180 @@ +""" +Some tests to check the funtionality of the k-means clustering algortihm +""" + import heat as ht +import numpy as np +import torch +import time + +ht.use_device("gpu") +# Convert data into DNDarrays +# The shape of this data is (3,5), i.e., +# 3 data points, each consisting of 5 features +x = [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50], [0, 2, 3, 4, 4]] +unit = ht.ones((3, 5), split=None) +unitvector = ht.ones((1, 5), split=None) +v = [[20, 30, 40, 5, 6], [11, 22, 33, 44, 55], [102, 204, 303, 406, 507], [30, 44, 53, 66, 77]] +y = ht.array(x) +w = ht.array(v) +# Split the data along different axes +y0 = ht.array(x, split=0) +y1 = ht.array(x, split=1) +# Convert data, labels, and centers from heat tensors to numpy arrays +# larray +y_as_np = y0.resplit_(None).larray.cpu().numpy() +# output the shape +y_shape0 = y0.shape +# print the number of features in each data point +n_features = y0.shape[1] +# calculate Euclidean distance between each +# row-vector in y and w +# !!! Important !!! +# ---> the arguments of cdist must be 2D tensors, i.e., ht.array([[1,2,3]]) instead of ht.array([1,2,3]) +dist = ht.spatial.distance.cdist(y, w) +# pick the minimum value of a tensor along the axis=1 +min_dist = dist.min(axis=0) +# define a tensor with the same dimension as y and fill it with zeros +centroids = ht.zeros((y.shape[0], y.shape[1])) +# replace the 0th row vector of "centroids" by a randomly chosen row vector of y +sample = ht.random.randint(0, y.shape[0] - 1).item() +centroids[0, :] = y[sample] +# Useful for degubbing: keep track auf matrix shapes and the process (i.e., the gpu) the data is assigned to +print(f"centroids.shape{centroids.shape}, process= {ht.MPI_WORLD.rank}\n") +# stack two vectors together +# a=ht.array([1,2,3,4]) +# b=ht.array([10,20,30,40]) +# a=ht.array(2) +# b=ht.array(3) +# stacked_ab=ht.stack((a,b),axis=0) +# add dimensions +a_vector = ht.array([1, 2, 3, 4]) +new_x = ht.expand_dims(a_vector, axis=0) # output: [[1,2,3,4]] +# stack two vectors together and flatten, so that the outcome is similar to the command "append" +a = ht.array([[1, 2, 3, 4], [1, 5, 3, 4], [1, 2, 3, 42]]) +# b=ht.array([[10,20,30,40],[10,20,30,40],[1,2,3,4]]) +# stacked_ab=ht.stack((a,b),axis=0) +# reshaped_stacked_ab=ht.reshape(stacked_ab,(stacked_ab.shape[0]*stacked_ab.shape[1],stacked_ab.shape[2])) +b = ht.array([[10, 20, 30, 40], [10, 20, 30, 40]]) +stacked_ab = ht.row_stack((a, b)) +# create random numbers between 0 and 1 +random = ht.random.rand(y.shape[0]) +# translate into a uniform probability distribution +random_prob = random / random.sum() +# find the indices for which the condition test1 First calculate the Euclidean distance between data points x and initial centroids - # output format: tensor + # and use it as an indicator for the order of magnitude for the 
number of necessary iterations init_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # --> init_distance calculates the Euclidean distance between data points x and initial centroids + # output format: tensor + init_min_distance = init_distance.min(axis=1) # --> Pick the minimal distance of the data points to each centroid # output format: vector - init_min_distance = init_distance.min(axis=1) + init_cost = init_min_distance.sum() # --> Now calculate the cost # output format: scalar - init_cost = init_min_distance.sum() + # # Iteratively fill the tensor storing the centroids for _ in ht.arange(0, iter_multiplier * ht.log(init_cost)): # Calculate the distance between data points and the current set of centroids distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) min_distance = distance.min(axis=1) # Sample each point in the data to a new set of centroids + prob = oversampling * min_distance / min_distance.sum() # --> probability distribution with oversampling factor # output format: vector - prob = oversampling * min_distance / min_distance.sum() + idx = ht.where(sample <= prob) # --> choose indices to sample the data according to prob # output format: vector - idx = ht.where(sample <= prob) + local_data = x[idx].resplit_(centroids.split) + # --> pick the data points that are identified as possible centroids and make sure + # that data points and centroids are split in the same way + # output format: vector + centroids = ht.row_stack((centroids, local_data)) # --> stack the data points with these indices to the DNDarray of centroids # output format: tensor - """print(f"idx={idx}") - if idx.shape[0]!=0: - print(f"idx={idx}, idx.shape={idx.shape}, x[idx]={x[idx]}") - local_data= x[idx].resplit_(centroids.split) # make sure, that the data points we append to centroids are split in the same way - centroids=ht.row_stack((centroids,local_data)) """ - # print(f"x[idx]={x[idx]}, x[idx].shape={x[idx].shape}, process= {ht.MPI_WORLD.rank}\n") - # print(f"centroids.split={centroids.split}, process= {ht.MPI_WORLD.rank}\n") - # if idx.shape[0]!=0: - local_data = x[idx].resplit_( - centroids.split - ) # make sure, that the data points we append to centroids are split in the same way - # local_data=x[idx] - # print(f"x[1]={x[1]}, local_data={local_data}, process= {ht.MPI_WORLD.rank}\n") - centroids = ht.row_stack((centroids, local_data)) # Evaluate distance between final centroids and data points if centroids.shape[0] <= self.n_clusters: raise ValueError( - "The oversampling factor and/or the number of iterations are chosen two small for the initialization of cluster centers." + "The oversampling factor and/or the number of iterations are chosen" + "too small for the initialization of cluster centers." 
) + # Evaluate the distance between data and the final set of centroids for the initialization final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) # For each data point in x, find the index of the centroid that is closest final_idx = ht.argmin(final_distance, axis=1) @@ -199,12 +194,11 @@ def _initialize_cluster_centers( weights[i] = ht.sum(final_idx == i) # Recluster the oversampled centroids using standard k-means ++ (here we use the # already implemented version in torch) - # --> first transform relevant arrays into torch tensors centroids = centroids.resplit_(None) centroids = centroids.larray weights = weights.resplit_(None) weights = weights.larray - # --> apply k-means ++ + # --> first transform relevant arrays into torch tensors if ht.MPI_WORLD.rank == 0: batch_kmeans = _kmex( centroids, @@ -216,28 +210,27 @@ def _initialize_cluster_centers( random_state=None, weights=weights, ) - reclustered_centroids = batch_kmeans[0] # access the reclustered centroids + # --> apply standard k-means ++ + # Note: as we only recluster the centroids for initialization with standard k-means ++, + # this list of centroids can also be used to initialize k-medians and k-medoids + reclustered_centroids = batch_kmeans[0] + # --> access the reclustered centroids else: # ensure that all processes have the same data - # tensor with zeros that has the same size as reclustered centroids, in order to to allocate memory with the correct type (necessary for broadcast) reclustered_centroids = torch.zeros( (self.n_clusters, centroids.shape[1]), dtype=x.dtype.torch_type(), device=centroids.device, ) + # --> tensor with zeros that has the same size as reclustered centroids, in order to + # allocate memory with the correct type in all processes (necessary for broadcast) ht.MPI_WORLD.Bcast( reclustered_centroids, root=0 ) # by default it is broadcasted from process 0 - # ------------------------------------------------------------------------------- - # print(f"reclustered centroids in initilialize_cluster_centers (after applying kmex)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") - # ------------------------------------------------------------------------------- - # --> transform back to DNDarray reclustered_centroids = ht.array(reclustered_centroids, split=x.split) - # final result + # --> transform back to DNDarray self._cluster_centers = reclustered_centroids - # ------------------------------------------------------------------------------- - # print(f"reclustered centroids in initilialize_cluster_centers (final result)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") - # ------------------------------------------------------------------------------- + # --> final result for initialized cluster centers else: raise NotImplementedError("Not implemented for other splitting-axes") diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py deleted file mode 100644 index 6a5783fa0..000000000 --- a/heat/cluster/mytest.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Some tests to check the funtionality of the k-means clustering algortihm -""" - -import heat as ht -import numpy as np -import torch -import time - -ht.use_device("gpu") -# Convert data into DNDarrays -# The shape of this data is (3,5), i.e., -# 3 data points, each consisting of 5 features -x = [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50], [0, 2, 3, 4, 4]] -unit = ht.ones((3, 5), split=None) -unitvector = ht.ones((1, 5), split=None) -v = [[20, 30, 40, 5, 6], [11, 22, 33, 44, 55], [102, 204, 303, 406, 507], [30, 44,
53, 66, 77]] -y = ht.array(x) -w = ht.array(v) -# Split the data along different axes -y0 = ht.array(x, split=0) -y1 = ht.array(x, split=1) -# Convert data, labels, and centers from heat tensors to numpy arrays -# larray -y_as_np = y0.resplit_(None).larray.cpu().numpy() -# output the shape -y_shape0 = y0.shape -# print the number of features in each data point -n_features = y0.shape[1] -# calculate Euclidean distance between each -# row-vector in y and w -# !!! Important !!! -# ---> the arguments of cdist must be 2D tensors, i.e., ht.array([[1,2,3]]) instead of ht.array([1,2,3]) -dist = ht.spatial.distance.cdist(y, w) -# pick the minimum value of a tensor along the axis=1 -min_dist = dist.min(axis=0) -# define a tensor with the same dimension as y and fill it with zeros -centroids = ht.zeros((y.shape[0], y.shape[1])) -# replace the 0th row vector of "centroids" by a randomly chosen row vector of y -sample = ht.random.randint(0, y.shape[0] - 1).item() -centroids[0, :] = y[sample] -# Useful for degubbing: keep track auf matrix shapes and the process (i.e., the gpu) the data is assigned to -print(f"centroids.shape{centroids.shape}, process= {ht.MPI_WORLD.rank}\n") -# stack two vectors together -# a=ht.array([1,2,3,4]) -# b=ht.array([10,20,30,40]) -# a=ht.array(2) -# b=ht.array(3) -# stacked_ab=ht.stack((a,b),axis=0) -# add dimensions -a_vector = ht.array([1, 2, 3, 4]) -new_x = ht.expand_dims(a_vector, axis=0) # output: [[1,2,3,4]] -# stack two vectors together and flatten, so that the outcome is similar to the command "append" -a = ht.array([[1, 2, 3, 4], [1, 5, 3, 4], [1, 2, 3, 42]]) -# b=ht.array([[10,20,30,40],[10,20,30,40],[1,2,3,4]]) -# stacked_ab=ht.stack((a,b),axis=0) -# reshaped_stacked_ab=ht.reshape(stacked_ab,(stacked_ab.shape[0]*stacked_ab.shape[1],stacked_ab.shape[2])) -b = ht.array([[10, 20, 30, 40], [10, 20, 30, 40]]) -stacked_ab = ht.row_stack((a, b)) -# create random numbers between 0 and 1 -random = ht.random.rand(y.shape[0]) -# translate into a uniform probability distribution -random_prob = random / random.sum() -# find the indices for which the condition test1
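
Note for reviewers of this series: the k-means|| logic added in PATCH 2/3 and tidied in PATCH 3/3 is spread over the distributed (DNDarray/MPI) code path, so the following is a minimal, single-process sketch of the same scheme in plain PyTorch for orientation only. It shows the three steps the patches implement: repeatedly oversample candidate centroids with probability proportional to each point's distance to its nearest current candidate, weight every candidate by the number of points closest to it, and recluster the small weighted candidate set down to k centers with weighted k-means++. The function names (kmeans_parallel_init, kmeanspp_weighted) and the default values for oversampling and rounds are illustrative and are not part of the Heat API or of these patches.

# Hypothetical single-process illustration of the k-means|| initialization scheme;
# names and defaults are illustrative only, not part of the Heat API.
import torch


def kmeanspp_weighted(candidates: torch.Tensor, weights: torch.Tensor, k: int) -> torch.Tensor:
    # Weighted k-means++ seeding over the small candidate set (mirrors the role of
    # _kmex(..., init="++", weights=...) on rank 0 in the patch).
    idxs = torch.zeros(k, dtype=torch.long)
    idxs[0] = torch.multinomial(weights, 1)
    for i in range(1, k):
        dist = torch.cdist(candidates, candidates[idxs[:i]]).min(dim=1).values
        idxs[i] = torch.multinomial(weights * dist, 1)
    return candidates[idxs]


def kmeans_parallel_init(x: torch.Tensor, k: int, oversampling: float = 10.0, rounds: int = 5) -> torch.Tensor:
    # 1) pick one data point uniformly at random as the first candidate centroid
    candidates = x[torch.randint(0, x.shape[0], (1,))]
    for _ in range(rounds):
        # 2) distance of every point to its closest current candidate
        d = torch.cdist(x, candidates).min(dim=1).values
        # 3) independently keep each point with probability ~ oversampling * d / d.sum()
        prob = oversampling * d / d.sum()
        picked = torch.rand(x.shape[0]) <= prob
        candidates = torch.cat([candidates, x[picked]], dim=0)
    # 4) weight every candidate by the number of points for which it is the closest one
    closest = torch.cdist(x, candidates).argmin(dim=1)
    weights = torch.bincount(closest, minlength=candidates.shape[0]).float()
    # 5) recluster the (small) weighted candidate set down to k centers;
    #    the patch raises a ValueError instead if fewer than k candidates were produced
    return kmeanspp_weighted(candidates, weights, k)


if __name__ == "__main__":
    x = torch.randn(1000, 5)
    print(kmeans_parallel_init(x, k=4).shape)  # torch.Size([4, 5])

In the distributed version added by these patches, steps 2 and 3 operate on DNDarrays split across processes, step 5 runs only on MPI rank 0 via _kmex, and the resulting centers are then broadcast to all ranks.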