KMeans removed and updated to pass kwargs
warunawickramasingha committed Jan 2, 2025
1 parent 661c29e commit 23898fe
Showing 2 changed files with 35 additions and 58 deletions.
88 changes: 33 additions & 55 deletions diffraction/WISH/bragg-detect/cnn/BraggDetectCNN.py
@@ -9,14 +9,12 @@
 from Diffraction.single_crystal.base_sx import BaseSX
 import time
 from enum import Enum
-from sklearn.cluster import KMeans, HDBSCAN
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.cluster import HDBSCAN
 from sklearn.metrics import silhouette_score
 
 class Clustering(Enum):
     QLab = 1
     HDBSCAN = 2
-    KMeans = 3
 
 
 class BraggDetectCNN:
@@ -47,94 +45,74 @@ def __init__(self, model_weights_path, batch_size=64, workers=0, iou_threshold=0
         self.iou_threshold = iou_threshold
 
 
-    def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, **kwargs):
+    def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, clustering=Clustering.QLab.name, **kwargs):
         """
         Find bragg peaks using the pre trained FasterRCNN model and create a peaks workspace
         :param workspace: Workspace name or the object of Workspace from WISH, ex: "WISH0042730"
         :param output_ws_name: Name of the peaks workspace
         :param conf_threshold: Confidence threshold to filter peaks inferred from RCNN
-        :param kwargs: variable keyword params for clustering. default is {"name": "Qlab", "q_tol": 0.05}
-            Ex: {"name": "HDBSCAN", "keep_ignored_labels": True}
+        :param clustering: name of the clustering method. Default is QLab; allowed values are QLab or HDBSCAN
+        :param kwargs: variable keyword params for the selected clustering method
         """
-        clustering_params = {"name": "QLab", "q_tol": 0.05 }
-        clustering_params.update(kwargs)
 
         start_time = time.time()
         data_set, predicted_indices = self._do_cnn_inferencing(workspace)
 
         filtered_indices = predicted_indices[predicted_indices[:, -1] > conf_threshold]
 
         #Do Clustering
-        print(f"Starting peak clustering with { clustering_params['name'] } method..")
-        clustered_peaks = self._do_peak_clustering(filtered_indices, clustering_params)
-        print(f"Number of peaks after clustering is={len(clustered_peaks)}")
-
+        print(f"Starting peak clustering with {clustering} method..")
+        clustered_peaks = self._do_peak_clustering(filtered_indices, clustering, **kwargs)
         cluster_indices_rounded = np.round(clustered_peaks[:, :3]).astype(int)
         peaksws = createPeaksWorkspaceFromIndices(data_set.get_workspace(), output_ws_name, cluster_indices_rounded, data_set.get_ws_as_3d_array())
         for ipk, pk in enumerate(peaksws):
             pk.setIntensity(clustered_peaks[ipk, -1])
 
-        if clustering_params["name"] == Clustering.QLab.name:
+        if clustering == Clustering.QLab.name:
             #Filter peaks by qlab
-            BaseSX.remove_duplicate_peaks_by_qlab(peaksws, clustering_params["q_tol"])
+            clustering_params = {"q_tol": 0.05}
+            clustering_params.update(kwargs)
+            BaseSX.remove_duplicate_peaks_by_qlab(peaksws, **clustering_params)
 
+        print(f"Number of peaks after clustering is = {len(peaksws)}")
+
         data_set.delete_rebunched_ws()
-        print(f"Bragg peaks finding from FasterRCNN model is completed in {time.time()-start_time} seconds!")
+        print(f"Bragg peaks finding from FasterRCNN model is completed in {time.time()-start_time:.2f} seconds!")
 
 
-    def _do_peak_clustering(self, detected_peaks, params):
-        print(f"Number of peaks before clustering={len(detected_peaks)}")
-        if params["name"] == Clustering.HDBSCAN.name:
-            return self._do_hdbscan_clustering(detected_peaks, params)
-        elif params["name"] == Clustering.KMeans.name:
-            return self._do_kmeans_clustering(detected_peaks)
+    def _do_peak_clustering(self, detected_peaks, clustering, **kwargs):
+        print(f"Number of peaks before clustering = {len(detected_peaks)}")
+        if clustering == Clustering.HDBSCAN.name:
+            return self._do_hdbscan_clustering(detected_peaks, **kwargs)
         else:
             return detected_peaks
 
 
-    def _do_hdbscan_clustering(self, peakdata, params):
+    def _do_hdbscan_clustering(self, peakdata, keep_ignored_labels=True, **kwargs):
         data = np.delete(peakdata, [3,4], axis=1)
 
-        hdbscan = HDBSCAN(min_cluster_size=2,
-                          min_samples=2,
-                          store_centers="medoid",
-                          algorithm="auto",
-                          cluster_selection_method="eom",
-                          metric="euclidean")
+        if ("keep_ignored_labels" in kwargs):
+            keep_ignored_labels = kwargs.pop("keep_ignored_labels")
+
+        hdbscan_params = {"min_cluster_size": 2,
+                          "min_samples": 2,
+                          "store_centers": "medoid",
+                          "algorithm": "auto",
+                          "cluster_selection_method": "eom",
+                          "metric": "euclidean"
+                          }
+        hdbscan_params.update(kwargs)
+        hdbscan = HDBSCAN(**hdbscan_params)
         hdbscan.fit(data)
         print(f"Silhouette score of the clusters={silhouette_score(data, hdbscan.labels_)}")
 
-        if ("keep_ignored_labels" in params) and params["keep_ignored_labels"]:
-            selected_peaks = np.concatenate((hdbscan.medoids_, data[np.where(hdbscan.labels_==-1)]), axis=0)
+        if keep_ignored_labels:
+            selected_peaks = np.concatenate((hdbscan.medoids_, data[np.where(hdbscan.labels_==-1)]), axis=0)
         else:
             selected_peaks = hdbscan.medoids_
         confidence = []
         for peak in selected_peaks:
             confidence.append(peakdata[np.where((data == peak).all(axis=1))[0].item(), -1])
         return np.column_stack((selected_peaks, confidence))
 
 
-    def _do_kmeans_clustering(self, peakdata):
-        stdScaler = StandardScaler()
-        peakdata[:, 3] = stdScaler.fit_transform(peakdata[:, 3].reshape(-1,1)).flatten()
-        minmaxScaler = MinMaxScaler()
-        peakdata[:, 4] = minmaxScaler.fit_transform(peakdata[:, 4].reshape(-1, 1)).flatten()
-
-        WCSS = []
-        cluster_range = range(1, len(peakdata), 2)
-        for i in cluster_range:
-            model = KMeans(n_clusters = i, init = 'k-means++')
-            model.fit(peakdata)
-            WCSS.append(model.inertia_)
-
-        first_derivative = np.diff(WCSS, n=1)
-        elbow_point = np.argmax(first_derivative) + 1
-        print(f"Selected elbow point={elbow_point} for KMeans clustering")
-        finalmodel = KMeans(n_clusters = elbow_point, init = "k-means++", max_iter = 500, n_init = 10, random_state = 0)
-        finalmodel.fit_predict(peakdata)
-        print(f"Silhouette score of the clusters={silhouette_score(peakdata, finalmodel.labels_)}")
-        return finalmodel.cluster_centers_
-
-
 
     def _do_cnn_inferencing(self, workspace):
         data_set = WISHWorkspaceDataSet(workspace)
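With KMeans gone, the keyword arguments of `find_bragg_peaks` now flow through `_do_peak_clustering` into `_do_hdbscan_clustering`, where `keep_ignored_labels` is consumed and any remaining keys override the `hdbscan_params` defaults before `sklearn.cluster.HDBSCAN` is constructed. A minimal sketch of a call exercising the new signature (the weights path and workspace name are taken from the README example below; the keyword values themselves are illustrative, not recommended settings):

```python
from cnn.BraggDetectCNN import BraggDetectCNN

model_weights = r'/mnt/ceph/auxiliary/wish/BraggDetect_FasterRCNN_Resnet50_Weights_v1.pt'
detector = BraggDetectCNN(model_weights_path=model_weights, batch_size=64)

# `keep_ignored_labels` is consumed by _do_hdbscan_clustering itself; the
# remaining keywords (here `min_cluster_size`) are merged into hdbscan_params
# via hdbscan_params.update(kwargs) and handed to sklearn.cluster.HDBSCAN.
detector.find_bragg_peaks(
    workspace="WISH00042730",
    output_ws_name="CNN_Peaks",
    conf_threshold=0.0,
    clustering="HDBSCAN",
    keep_ignored_labels=True,
    min_cluster_size=3,  # overrides the default of 2
)
```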
5 changes: 2 additions & 3 deletions diffraction/WISH/bragg-detect/cnn/README.md
@@ -7,12 +7,11 @@ Inorder to use the pre-trained Faster RCNN model inside mantid using an IDAaaS i
 * Launch Mantid workbench nightly from Applications->Software->Mantid->Mantid Workbench Nightly
 * Download `scriptrepository\diffraction\WISH` directory from mantid's script repository as instructed here https://docs.mantidproject.org/nightly/workbench/scriptrepository.html
 * Check whether `<local path>\diffraction\WISH` path is listed under `Python Script Directories` tab from `File->Manage User Directories` of Mantid workbench.
-* Below is an example code snippet to test the code. It will create a peaks workspace with the inferred peaks from the cnn. The valid values for the clustering are QLab, HDBSCAN, KMeans.
+* Below is an example code snippet to test the code. It will create a peaks workspace with the inferred peaks from the cnn. The valid values for the clustering are QLab or HDBSCAN.
 ```python
 from cnn.BraggDetectCNN import BraggDetectCNN
 model_weights = r'/mnt/ceph/auxiliary/wish/BraggDetect_FasterRCNN_Resnet50_Weights_v1.pt'
 cnn_peaks_detector = BraggDetectCNN(model_weights_path=model_weights, batch_size=64)
-clustering_params = {"name":"QLab", "q_tol": 0.05}
-cnn_peaks_detector.find_bragg_peaks(workspace='WISH00042730', output_ws_name="CNN_Peaks", conf_threshold=0.0, **clustering_params)
+cnn_peaks_detector.find_bragg_peaks(workspace='WISH00042730', output_ws_name="CNN_Peaks", conf_threshold=0.0, clustering="QLab")
 ```
 * If the above import is not working, check whether the `<local path>\diffraction\WISH` path is listed under `Python Script Directories` tab from `File->Manage User Directories`.
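Likewise, with the old `clustering_params` dict removed, a custom `q_tol` for the QLab duplicate filter now rides along as a plain keyword argument. A sketch of the same snippet with a non-default tolerance (the `0.1` value is purely illustrative):

```python
from cnn.BraggDetectCNN import BraggDetectCNN

model_weights = r'/mnt/ceph/auxiliary/wish/BraggDetect_FasterRCNN_Resnet50_Weights_v1.pt'
cnn_peaks_detector = BraggDetectCNN(model_weights_path=model_weights, batch_size=64)

# q_tol travels through **kwargs into the QLab branch, where it overrides the
# 0.05 default before BaseSX.remove_duplicate_peaks_by_qlab is called.
cnn_peaks_detector.find_bragg_peaks(workspace='WISH00042730', output_ws_name="CNN_Peaks",
                                    conf_threshold=0.0, clustering="QLab", q_tol=0.1)
```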
