From 57098a565bb2e111cddcf0a617b5abeacf75d53f Mon Sep 17 00:00:00 2001 From: Hakdag97 <72792786+Hakdag97@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:00:21 +0100 Subject: [PATCH 1/3] Created a test file mytest.py --- heat/cluster/mytest.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 heat/cluster/mytest.py diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py new file mode 100644 index 000000000..30e40b186 --- /dev/null +++ b/heat/cluster/mytest.py @@ -0,0 +1,4 @@ +import heat as ht + +ht.use_device('gpu') +ht.zeros((3, 4,)) From 5b7a6e0409d410cab4abe0f07970560c1ae3604f Mon Sep 17 00:00:00 2001 From: Akdag Date: Mon, 16 Dec 2024 16:34:58 +0100 Subject: [PATCH 2/3] Implementation of parallel initialization --- heat/cluster/_kcluster.py | 158 +++++++++++++++------ heat/cluster/batchparallelclustering.py | 21 ++- heat/cluster/kmeans.py | 9 +- heat/cluster/kmedians.py | 11 +- heat/cluster/kmedoids.py | 9 +- heat/cluster/mytest.py | 180 +++++++++++++++++++++++- heat/cluster/tests/test_kmedoids.py | 5 +- heat/core/indexing.py | 3 +- 8 files changed, 333 insertions(+), 63 deletions(-) diff --git a/heat/cluster/_kcluster.py b/heat/cluster/_kcluster.py index c9505abf1..6029cc721 100644 --- a/heat/cluster/_kcluster.py +++ b/heat/cluster/_kcluster.py @@ -3,6 +3,8 @@ """ import heat as ht +import torch +from heat.cluster.batchparallelclustering import _kmex from typing import Optional, Union, Callable from heat.core.dndarray import DNDarray @@ -94,7 +96,9 @@ def functional_value_(self) -> DNDarray: """ return self._functional_value - def _initialize_cluster_centers(self, x: DNDarray): + def _initialize_cluster_centers( + self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20 + ): """ Initializes the K-Means centroids. @@ -102,6 +106,12 @@ def _initialize_cluster_centers(self, x: DNDarray): ---------- x : DNDarray The data to initialize the clusters for. 
Shape = (n_samples, n_features) + + oversampling : float + oversampling factor used in the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # always initialize the random state if self.random_state is not None: @@ -123,53 +133,113 @@ def _initialize_cluster_centers(self, x: DNDarray): raise ValueError("passed centroids do not match cluster count or data shape") self._cluster_centers = self.init.resplit(None) - # Smart centroid guessing, random sampling with probability weight proportional to distance to existing centroids + # Parallelized centroid guessing using the k-means|| algorithm elif self.init == "probability_based": + # First, check along which axis the data is sliced if x.split is None or x.split == 0: - centroids = ht.zeros( - (self.n_clusters, x.shape[1]), split=None, device=x.device, comm=x.comm - ) - sample = ht.random.randint(0, x.shape[0] - 1).item() - _, displ, _ = x.comm.counts_displs_shape(shape=x.shape, axis=0) - proc = 0 - for p in range(x.comm.size): - if displ[p] > sample: - break - proc = p - x0 = ht.zeros(x.shape[1], dtype=x.dtype, device=x.device, comm=x.comm) - if x.comm.rank == proc: - idx = sample - displ[proc] - x0 = ht.array(x.lloc[idx, :], device=x.device, comm=x.comm) - x0.comm.Bcast(x0, root=proc) - centroids[0, :] = x0 - for i in range(1, self.n_clusters): - distances = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) - D2 = distances.min(axis=1) - D2.resplit_(axis=None) - prob = D2 / D2.sum() - random_position = ht.random.rand() - sample = 0 - sum = 0 - for j in range(len(prob)): - if sum > random_position: - break - sum += prob[j].item() - sample = j - proc = 0 - for p in range(x.comm.size): - if displ[p] > sample: - break - proc = p - xi = ht.zeros(x.shape[1], dtype=x.dtype) - if x.comm.rank == proc: - idx = sample - displ[proc] - xi = ht.array(x.lloc[idx, :], device=x.device, comm=x.comm) - xi.comm.Bcast(xi, root=proc) - centroids[i, :] = xi - + # Define a list of random, uniformly distributed probabilities, which is later used to sample the centroids + sample = ht.random.rand(x.shape[0], split=x.split) + # Define a random integer serving as a label to pick the first centroid randomly + init_idx = ht.random.randint(0, x.shape[0] - 1).item() + # Randomly select first centroid and organize it as a tensor, in order to use the function cdist later.
+ # This tensor will be filled continously in the proceeding of this function + # We assume that the centroids fit into the memory of a single GPU + centroids = ht.expand_dims(x[init_idx, :].resplit_(None), axis=0) + # Calculate the initial cost of the clustering after the first centroid selection + # and use it as an indicator for the number of necessary iterations + # --> First calculate the Euclidean distance between data points x and initial centroids + # output format: tensor + init_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # --> Pick the minimal distance of the data points to each centroid + # output format: vector + init_min_distance = init_distance.min(axis=1) + # --> Now calculate the cost + # output format: scalar + init_cost = init_min_distance.sum() + # Iteratively fill the tensor storing the centroids + for _ in ht.arange(0, iter_multiplier * ht.log(init_cost)): + # Calculate the distance between data points and the current set of centroids + distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + min_distance = distance.min(axis=1) + # Sample each point in the data to a new set of centroids + # --> probability distribution with oversampling factor + # output format: vector + prob = oversampling * min_distance / min_distance.sum() + # --> choose indices to sample the data according to prob + # output format: vector + idx = ht.where(sample <= prob) + # --> stack the data points with these indices to the DNDarray of centroids + # output format: tensor + """print(f"idx={idx}") + if idx.shape[0]!=0: + print(f"idx={idx}, idx.shape={idx.shape}, x[idx]={x[idx]}") + local_data= x[idx].resplit_(centroids.split) # make sure, that the data points we append to centroids are split in the same way + centroids=ht.row_stack((centroids,local_data)) """ + # print(f"x[idx]={x[idx]}, x[idx].shape={x[idx].shape}, process= {ht.MPI_WORLD.rank}\n") + # print(f"centroids.split={centroids.split}, process= {ht.MPI_WORLD.rank}\n") + # if idx.shape[0]!=0: + local_data = x[idx].resplit_( + centroids.split + ) # make sure, that the data points we append to centroids are split in the same way + # local_data=x[idx] + # print(f"x[1]={x[1]}, local_data={local_data}, process= {ht.MPI_WORLD.rank}\n") + centroids = ht.row_stack((centroids, local_data)) + # Evaluate distance between final centroids and data points + if centroids.shape[0] <= self.n_clusters: + raise ValueError( + "The oversampling factor and/or the number of iterations are chosen two small for the initialization of cluster centers." 
) + final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # For each data point in x, find the index of the centroid that is closest + final_idx = ht.argmin(final_distance, axis=1) + # Introduce weights, i.e., the number of data points closest to each centroid + # (count how often the same index in final_idx occurs) + weights = ht.zeros(centroids.shape[0], split=centroids.split) + for i in range(centroids.shape[0]): + weights[i] = ht.sum(final_idx == i) + # Recluster the oversampled centroids using standard k-means ++ (here we use the + # already implemented version in torch) + # --> first transform relevant arrays into torch tensors + centroids = centroids.resplit_(None) + centroids = centroids.larray + weights = weights.resplit_(None) + weights = weights.larray + # --> apply k-means ++ + if ht.MPI_WORLD.rank == 0: + batch_kmeans = _kmex( + centroids, + p=2, + n_clusters=self.n_clusters, + init="++", + max_iter=self.max_iter, + tol=self.tol, + random_state=None, + weights=weights, + ) + reclustered_centroids = batch_kmeans[0] # access the reclustered centroids + else: + # ensure that all processes have the same data + # tensor with zeros that has the same size as reclustered centroids, in order to to allocate memory with the correct type (necessary for broadcast) + reclustered_centroids = torch.zeros( + (self.n_clusters, centroids.shape[1]), + dtype=x.dtype.torch_type(), + device=centroids.device, + ) + ht.MPI_WORLD.Bcast( + reclustered_centroids, root=0 + ) # by default it is broadcasted from process 0 + # ------------------------------------------------------------------------------- + # print(f"reclustered centroids in initilialize_cluster_centers (after applying kmex)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") + # ------------------------------------------------------------------------------- + # --> transform back to DNDarray + reclustered_centroids = ht.array(reclustered_centroids, split=x.split) + # final result + self._cluster_centers = reclustered_centroids + # ------------------------------------------------------------------------------- + # print(f"reclustered centroids in initilialize_cluster_centers (final result)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") + # ------------------------------------------------------------------------------- else: raise NotImplementedError("Not implemented for other splitting-axes") - self._cluster_centers = centroids elif self.init == "batchparallel": if x.split == 0: diff --git a/heat/cluster/batchparallelclustering.py b/heat/cluster/batchparallelclustering.py index 257b88c18..d6d756ef5 100644 --- a/heat/cluster/batchparallelclustering.py +++ b/heat/cluster/batchparallelclustering.py @@ -4,7 +4,8 @@ import heat as ht import torch -from heat.cluster._kcluster import _KCluster + +# from heat.cluster._kcluster import _KCluster from heat.core.dndarray import DNDarray from warnings import warn from math import log @@ -19,10 +20,14 @@ """ -def _initialize_plus_plus(X, n_clusters, p, random_state=None, max_samples=2**24 - 1): +def _initialize_plus_plus( + X, n_clusters, p, random_state=None, weights: torch.tensor = 1, max_samples=2**24 - 1 +): """ Auxiliary function: single-process k-means++/k-medians++ initialization in pytorch p is the norm used for computing distances + weights allows adding weights to the distribution function, so that the data points with higher weights are preferred; + note that weights must be a 1-D tensor with one entry per data point, i.e., of length X.shape[0] The value max_samples=2**24 - 1 is
necessary as PyTorchs multinomial currently only supports this number of different categories. """ @@ -37,11 +42,11 @@ def _initialize_plus_plus(X, n_clusters, p, random_state=None, max_samples=2**24 for i in range(1, n_clusters): dist = torch.cdist(X, X[idxs[:i]], p=p) dist = torch.min(dist, dim=1)[0] - idxs[i] = torch.multinomial(dist, 1) + idxs[i] = torch.multinomial(weights * dist, 1) return X[idxs] -def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None): +def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None, weights: torch.tensor = 1.0): """ Auxiliary function: single-process k-means and k-medians in pytorch p is the norm used for computing distances: p=2 implies k-means, p=1 implies k-medians. @@ -55,7 +60,7 @@ def _kmex(X, p, n_clusters, init, max_iter, tol, random_state=None): raise ValueError("if a torch tensor, init must have shape (n_clusters, n_features).") centers = init elif init == "++": - centers = _initialize_plus_plus(X, n_clusters, p, random_state) + centers = _initialize_plus_plus(X, n_clusters, p, random_state, weights) elif init == "random": idxs = torch.randint(0, X.shape[0], (n_clusters,)) centers = X[idxs] @@ -169,7 +174,7 @@ def functional_value_(self) -> float: """ return self._functional_value - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, weights: torch.tensor = 1): """ Computes the centroid of the clustering algorithm to fit the data ``x``. @@ -178,6 +183,8 @@ def fit(self, x: DNDarray): x : DNDarray Training instances to cluster. Shape = (n_samples, n_features). It must hold x.split=0. + weights: torch.tensor + Add weights to the distribution function used in the clustering algorithm in kmex """ if not isinstance(x, DNDarray): raise TypeError(f"input needs to be a ht.DNDarray, but was {type(x)}") @@ -198,6 +205,7 @@ def fit(self, x: DNDarray): self.max_iter, self.tol, local_random_state, + weights, ) # hierarchical approach to obtail "global" cluster centers from the "local" centers @@ -233,6 +241,7 @@ def fit(self, x: DNDarray): self.max_iter, self.tol, local_random_state, + weights, ) del gathered_centers_local n_iters_local += n_iters_local_new diff --git a/heat/cluster/kmeans.py b/heat/cluster/kmeans.py index 96067aa82..9a247bc42 100644 --- a/heat/cluster/kmeans.py +++ b/heat/cluster/kmeans.py @@ -102,7 +102,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray) -> self: + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20) -> self: """ Computes the centroid of a k-means clustering. @@ -111,13 +111,18 @@ def fit(self, x: DNDarray) -> self: x : DNDarray Training instances to cluster. 
Shape = (n_samples, n_features) + oversampling : float + oversampling factor used for the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/kmedians.py b/heat/cluster/kmedians.py index c7d991b1f..0bd2cbb66 100644 --- a/heat/cluster/kmedians.py +++ b/heat/cluster/kmedians.py @@ -65,6 +65,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): ---------- x : DNDarray Input data + matching_centroids : DNDarray Array filled with indeces ``i`` indicating to which cluster ``ci`` each sample point in x is assigned @@ -103,7 +104,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20): """ Computes the centroid of a k-medians clustering. @@ -111,13 +112,19 @@ def fit(self, x: DNDarray): ---------- x : DNDarray Training instances to cluster. Shape = (n_samples, n_features) + + oversampling : float + oversampling factor used in the k-means|| initialization of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, ht.DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/kmedoids.py b/heat/cluster/kmedoids.py index 0eb38a5eb..ec20dd24f 100644 --- a/heat/cluster/kmedoids.py +++ b/heat/cluster/kmedoids.py @@ -114,7 +114,7 @@ def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray): return new_cluster_centers - def fit(self, x: DNDarray): + def fit(self, x: DNDarray, oversampling: float = 100, iter_multiplier: float = 20): """ Computes the centroid of a k-medoids clustering. @@ -122,13 +122,18 @@ def fit(self, x: DNDarray): ---------- x : DNDarray Training instances to cluster.
Shape = (n_samples, n_features) + oversampling : float + oversampling factor used in the k-means|| initializiation of centroids + + iter_multiplier : float + factor that increases the number of iterations used in the initialization of centroids """ # input sanitation if not isinstance(x, DNDarray): raise ValueError(f"input needs to be a ht.DNDarray, but was {type(x)}") # initialize the clustering - self._initialize_cluster_centers(x) + self._initialize_cluster_centers(x, oversampling, iter_multiplier) self._n_iter = 0 # iteratively fit the points to the centroids diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py index 30e40b186..6a5783fa0 100644 --- a/heat/cluster/mytest.py +++ b/heat/cluster/mytest.py @@ -1,4 +1,180 @@ +""" +Some tests to check the funtionality of the k-means clustering algortihm +""" + import heat as ht +import numpy as np +import torch +import time + +ht.use_device("gpu") +# Convert data into DNDarrays +# The shape of this data is (3,5), i.e., +# 3 data points, each consisting of 5 features +x = [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50], [0, 2, 3, 4, 4]] +unit = ht.ones((3, 5), split=None) +unitvector = ht.ones((1, 5), split=None) +v = [[20, 30, 40, 5, 6], [11, 22, 33, 44, 55], [102, 204, 303, 406, 507], [30, 44, 53, 66, 77]] +y = ht.array(x) +w = ht.array(v) +# Split the data along different axes +y0 = ht.array(x, split=0) +y1 = ht.array(x, split=1) +# Convert data, labels, and centers from heat tensors to numpy arrays +# larray +y_as_np = y0.resplit_(None).larray.cpu().numpy() +# output the shape +y_shape0 = y0.shape +# print the number of features in each data point +n_features = y0.shape[1] +# calculate Euclidean distance between each +# row-vector in y and w +# !!! Important !!! +# ---> the arguments of cdist must be 2D tensors, i.e., ht.array([[1,2,3]]) instead of ht.array([1,2,3]) +dist = ht.spatial.distance.cdist(y, w) +# pick the minimum value of a tensor along the axis=1 +min_dist = dist.min(axis=0) +# define a tensor with the same dimension as y and fill it with zeros +centroids = ht.zeros((y.shape[0], y.shape[1])) +# replace the 0th row vector of "centroids" by a randomly chosen row vector of y +sample = ht.random.randint(0, y.shape[0] - 1).item() +centroids[0, :] = y[sample] +# Useful for degubbing: keep track auf matrix shapes and the process (i.e., the gpu) the data is assigned to +print(f"centroids.shape{centroids.shape}, process= {ht.MPI_WORLD.rank}\n") +# stack two vectors together +# a=ht.array([1,2,3,4]) +# b=ht.array([10,20,30,40]) +# a=ht.array(2) +# b=ht.array(3) +# stacked_ab=ht.stack((a,b),axis=0) +# add dimensions +a_vector = ht.array([1, 2, 3, 4]) +new_x = ht.expand_dims(a_vector, axis=0) # output: [[1,2,3,4]] +# stack two vectors together and flatten, so that the outcome is similar to the command "append" +a = ht.array([[1, 2, 3, 4], [1, 5, 3, 4], [1, 2, 3, 42]]) +# b=ht.array([[10,20,30,40],[10,20,30,40],[1,2,3,4]]) +# stacked_ab=ht.stack((a,b),axis=0) +# reshaped_stacked_ab=ht.reshape(stacked_ab,(stacked_ab.shape[0]*stacked_ab.shape[1],stacked_ab.shape[2])) +b = ht.array([[10, 20, 30, 40], [10, 20, 30, 40]]) +stacked_ab = ht.row_stack((a, b)) +# create random numbers between 0 and 1 +random = ht.random.rand(y.shape[0]) +# translate into a uniform probability distribution +random_prob = random / random.sum() +# find the indices for which the condition test1 First calculate the Euclidean distance between data points x and initial centroids - # output format: tensor + # and use it as an indicator for the order of magnitude for the 
number of necessary iterations init_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) + # --> init_distance calculates the Euclidean distance between data points x and initial centroids + # output format: tensor + init_min_distance = init_distance.min(axis=1) # --> Pick the minimal distance of the data points to each centroid # output format: vector - init_min_distance = init_distance.min(axis=1) + init_cost = init_min_distance.sum() # --> Now calculate the cost # output format: scalar - init_cost = init_min_distance.sum() + # # Iteratively fill the tensor storing the centroids for _ in ht.arange(0, iter_multiplier * ht.log(init_cost)): # Calculate the distance between data points and the current set of centroids distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) min_distance = distance.min(axis=1) # Sample each point in the data to a new set of centroids + prob = oversampling * min_distance / min_distance.sum() # --> probability distribution with oversampling factor # output format: vector - prob = oversampling * min_distance / min_distance.sum() + idx = ht.where(sample <= prob) # --> choose indices to sample the data according to prob # output format: vector - idx = ht.where(sample <= prob) + local_data = x[idx].resplit_(centroids.split) + # --> pick the data points that are identified as possible centroids and make sure + # that data points and centroids are split in the same way + # output format: vector + centroids = ht.row_stack((centroids, local_data)) # --> stack the data points with these indices to the DNDarray of centroids # output format: tensor - """print(f"idx={idx}") - if idx.shape[0]!=0: - print(f"idx={idx}, idx.shape={idx.shape}, x[idx]={x[idx]}") - local_data= x[idx].resplit_(centroids.split) # make sure, that the data points we append to centroids are split in the same way - centroids=ht.row_stack((centroids,local_data)) """ - # print(f"x[idx]={x[idx]}, x[idx].shape={x[idx].shape}, process= {ht.MPI_WORLD.rank}\n") - # print(f"centroids.split={centroids.split}, process= {ht.MPI_WORLD.rank}\n") - # if idx.shape[0]!=0: - local_data = x[idx].resplit_( - centroids.split - ) # make sure, that the data points we append to centroids are split in the same way - # local_data=x[idx] - # print(f"x[1]={x[1]}, local_data={local_data}, process= {ht.MPI_WORLD.rank}\n") - centroids = ht.row_stack((centroids, local_data)) # Evaluate distance between final centroids and data points if centroids.shape[0] <= self.n_clusters: raise ValueError( - "The oversampling factor and/or the number of iterations are chosen two small for the initialization of cluster centers." + "The oversampling factor and/or the number of iterations are chosen" + "too small for the initialization of cluster centers." 
) + # Evaluate the distance between data and the final set of centroids for the initialization final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True) # For each data point in x, find the index of the centroid that is closest final_idx = ht.argmin(final_distance, axis=1) @@ -199,12 +194,11 @@ def _initialize_cluster_centers( weights[i] = ht.sum(final_idx == i) # Recluster the oversampled centroids using standard k-means ++ (here we use the # already implemented version in torch) - # --> first transform relevant arrays into torch tensors centroids = centroids.resplit_(None) centroids = centroids.larray weights = weights.resplit_(None) weights = weights.larray - # --> apply k-means ++ + # --> first transform relevant arrays into torch tensors if ht.MPI_WORLD.rank == 0: batch_kmeans = _kmex( centroids, @@ -216,28 +210,27 @@ def _initialize_cluster_centers( random_state=None, weights=weights, ) - reclustered_centroids = batch_kmeans[0] # access the reclustered centroids + # --> apply standard k-means ++ + # Note: as we only recluster the centroids for initialization with standard k-means ++, + # this list of centroids can also be used to initialize k-medians and k-medoids + reclustered_centroids = batch_kmeans[0] + # --> access the reclustered centroids else: # ensure that all processes have the same data - # tensor with zeros that has the same size as reclustered centroids, in order to to allocate memory with the correct type (necessary for broadcast) reclustered_centroids = torch.zeros( (self.n_clusters, centroids.shape[1]), dtype=x.dtype.torch_type(), device=centroids.device, ) + # --> tensor with zeros that has the same size as reclustered centroids, in order to + # allocate memory with the correct type in all processes (necessary for broadcast) ht.MPI_WORLD.Bcast( reclustered_centroids, root=0 ) # by default it is broadcasted from process 0 - # ------------------------------------------------------------------------------- - # print(f"reclustered centroids in initilialize_cluster_centers (after applying kmex)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") - # ------------------------------------------------------------------------------- - # --> transform back to DNDarray reclustered_centroids = ht.array(reclustered_centroids, split=x.split) - # final result + # --> transform back to DNDarray self._cluster_centers = reclustered_centroids - # ------------------------------------------------------------------------------- - # print(f"reclustered centroids in initilialize_cluster_centers (final result)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n") - # ------------------------------------------------------------------------------- + # --> final result for initialized cluster centers else: raise NotImplementedError("Not implemented for other splitting-axes") diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py deleted file mode 100644 index 6a5783fa0..000000000 --- a/heat/cluster/mytest.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Some tests to check the funtionality of the k-means clustering algortihm -""" - -import heat as ht -import numpy as np -import torch -import time - -ht.use_device("gpu") -# Convert data into DNDarrays -# The shape of this data is (3,5), i.e., -# 3 data points, each consisting of 5 features -x = [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50], [0, 2, 3, 4, 4]] -unit = ht.ones((3, 5), split=None) -unitvector = ht.ones((1, 5), split=None) -v = [[20, 30, 40, 5, 6], [11, 22, 33, 44, 55], [102, 204, 303, 406, 507], [30, 44,
53, 66, 77]] -y = ht.array(x) -w = ht.array(v) -# Split the data along different axes -y0 = ht.array(x, split=0) -y1 = ht.array(x, split=1) -# Convert data, labels, and centers from heat tensors to numpy arrays -# larray -y_as_np = y0.resplit_(None).larray.cpu().numpy() -# output the shape -y_shape0 = y0.shape -# print the number of features in each data point -n_features = y0.shape[1] -# calculate Euclidean distance between each -# row-vector in y and w -# !!! Important !!! -# ---> the arguments of cdist must be 2D tensors, i.e., ht.array([[1,2,3]]) instead of ht.array([1,2,3]) -dist = ht.spatial.distance.cdist(y, w) -# pick the minimum value of a tensor along the axis=1 -min_dist = dist.min(axis=0) -# define a tensor with the same dimension as y and fill it with zeros -centroids = ht.zeros((y.shape[0], y.shape[1])) -# replace the 0th row vector of "centroids" by a randomly chosen row vector of y -sample = ht.random.randint(0, y.shape[0] - 1).item() -centroids[0, :] = y[sample] -# Useful for degubbing: keep track auf matrix shapes and the process (i.e., the gpu) the data is assigned to -print(f"centroids.shape{centroids.shape}, process= {ht.MPI_WORLD.rank}\n") -# stack two vectors together -# a=ht.array([1,2,3,4]) -# b=ht.array([10,20,30,40]) -# a=ht.array(2) -# b=ht.array(3) -# stacked_ab=ht.stack((a,b),axis=0) -# add dimensions -a_vector = ht.array([1, 2, 3, 4]) -new_x = ht.expand_dims(a_vector, axis=0) # output: [[1,2,3,4]] -# stack two vectors together and flatten, so that the outcome is similar to the command "append" -a = ht.array([[1, 2, 3, 4], [1, 5, 3, 4], [1, 2, 3, 42]]) -# b=ht.array([[10,20,30,40],[10,20,30,40],[1,2,3,4]]) -# stacked_ab=ht.stack((a,b),axis=0) -# reshaped_stacked_ab=ht.reshape(stacked_ab,(stacked_ab.shape[0]*stacked_ab.shape[1],stacked_ab.shape[2])) -b = ht.array([[10, 20, 30, 40], [10, 20, 30, 40]]) -stacked_ab = ht.row_stack((a, b)) -# create random numbers between 0 and 1 -random = ht.random.rand(y.shape[0]) -# translate into a uniform probability distribution -random_prob = random / random.sum() -# find the indices for which the condition test1
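
Note for reviewers of this series: the k-means|| logic added in PATCH 2/3 and tidied in PATCH 3/3 is spread over the distributed (DNDarray/MPI) code path, so the following is a minimal, single-process sketch of the same scheme in plain PyTorch for orientation only. It shows the three steps the patches implement: repeatedly oversample candidate centroids with probability proportional to each point's distance to its nearest current candidate, weight every candidate by the number of points closest to it, and recluster the small weighted candidate set down to k centers with weighted k-means++. The function names (kmeans_parallel_init, kmeanspp_weighted) and the default values for oversampling and rounds are illustrative and are not part of the Heat API or of these patches.

# Hypothetical single-process illustration of the k-means|| initialization scheme;
# names and defaults are illustrative only, not part of the Heat API.
import torch


def kmeanspp_weighted(candidates: torch.Tensor, weights: torch.Tensor, k: int) -> torch.Tensor:
    # Weighted k-means++ seeding over the small candidate set (mirrors the role of
    # _kmex(..., init="++", weights=...) on rank 0 in the patch).
    idxs = torch.zeros(k, dtype=torch.long)
    idxs[0] = torch.multinomial(weights, 1)
    for i in range(1, k):
        dist = torch.cdist(candidates, candidates[idxs[:i]]).min(dim=1).values
        idxs[i] = torch.multinomial(weights * dist, 1)
    return candidates[idxs]


def kmeans_parallel_init(x: torch.Tensor, k: int, oversampling: float = 10.0, rounds: int = 5) -> torch.Tensor:
    # 1) pick one data point uniformly at random as the first candidate centroid
    candidates = x[torch.randint(0, x.shape[0], (1,))]
    for _ in range(rounds):
        # 2) distance of every point to its closest current candidate
        d = torch.cdist(x, candidates).min(dim=1).values
        # 3) independently keep each point with probability ~ oversampling * d / d.sum()
        prob = oversampling * d / d.sum()
        picked = torch.rand(x.shape[0]) <= prob
        candidates = torch.cat([candidates, x[picked]], dim=0)
    # 4) weight every candidate by the number of points for which it is the closest one
    closest = torch.cdist(x, candidates).argmin(dim=1)
    weights = torch.bincount(closest, minlength=candidates.shape[0]).float()
    # 5) recluster the (small) weighted candidate set down to k centers;
    #    the patch raises a ValueError instead if fewer than k candidates were produced
    return kmeanspp_weighted(candidates, weights, k)


if __name__ == "__main__":
    x = torch.randn(1000, 5)
    print(kmeans_parallel_init(x, k=4).shape)  # torch.Size([4, 5])

In the distributed version added by these patches, steps 2 and 3 operate on DNDarrays split across processes, step 5 runs only on MPI rank 0 via _kmex, and the resulting centers are then broadcast to all ranks.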