kwargs parameter added for clustering

mantidproject · Jan 2, 2025 · 661c29e · 661c29e
1 parent fa55698
commit 661c29e
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 17 deletions.
diff --git a/diffraction/WISH/bragg-detect/cnn/BraggDetectCNN.py b/diffraction/WISH/bragg-detect/cnn/BraggDetectCNN.py
@@ -47,59 +47,71 @@ def __init__(self, model_weights_path, batch_size=64, workers=0, iou_threshold=0
         self.iou_threshold = iou_threshold
 
 
-    def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, clustering=Clustering.QLab.name, q_tol=0.05):
+    def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, **kwargs):
         """
         Find bragg peaks using the pre trained FasterRCNN model and create a peaks workspace
         :param workspace: Workspace name or the object of Workspace from WISH, ex: "WISH0042730"
         :param output_ws_name: Name of the peaks workspace
         :param conf_threshold: Confidence threshold to filter peaks inferred from RCNN
-        :param clustering: Clustering method to filter and merge the peaks ex: QLab, HDBSCAN, KMeans
-        :param q_tol: QLab tolerance to remove duplicate peaks, it will onlye be useful when clustering is QLab
+        :param kwargs: variable keyword params for clustering. Default is {"name": "QLab", "q_tol": 0.05}
+            Ex: {"name": "HDBSCAN", "keep_ignored_labels": True}
         """
+        clustering_params = {"name": "QLab", "q_tol": 0.05 }
+        clustering_params.update(kwargs)
+
         start_time = time.time()
         data_set, predicted_indices = self._do_cnn_inferencing(workspace)
 
         filtered_indices = predicted_indices[predicted_indices[:, -1] > conf_threshold]
 
         #Do Clustering
-        print(f"Starting peak clustering with {clustering} method..")
-        clustered_peaks = self._do_peak_clustering(filtered_indices, clustering)
+        print(f"Starting peak clustering with { clustering_params['name'] } method..")
+        clustered_peaks = self._do_peak_clustering(filtered_indices, clustering_params)
         print(f"Number of peaks after clustering is={len(clustered_peaks)}")
 
         cluster_indices_rounded = np.round(clustered_peaks[:, :3]).astype(int)
         peaksws = createPeaksWorkspaceFromIndices(data_set.get_workspace(), output_ws_name, cluster_indices_rounded, data_set.get_ws_as_3d_array())
         for ipk, pk in enumerate(peaksws):
             pk.setIntensity(clustered_peaks[ipk, -1])
 
-        if clustering == Clustering.QLab.name:
+        if clustering_params["name"] == Clustering.QLab.name:
             #Filter peaks by qlab
-            BaseSX.remove_duplicate_peaks_by_qlab(peaksws, q_tol)
+            BaseSX.remove_duplicate_peaks_by_qlab(peaksws, clustering_params["q_tol"])
 
         data_set.delete_rebunched_ws()
         print(f"Bragg peaks finding from FasterRCNN model is completed in {time.time()-start_time} seconds!")
 
 
-    def _do_peak_clustering(self, detected_peaks, clustering):
+    def _do_peak_clustering(self, detected_peaks, params):
         print(f"Number of peaks before clustering={len(detected_peaks)}")
-        if clustering == Clustering.HDBSCAN.name:
-            return self._do_hdbscan_clustering(detected_peaks)
-        elif clustering == Clustering.KMeans.name:
+        if params["name"] == Clustering.HDBSCAN.name:
+            return self._do_hdbscan_clustering(detected_peaks, params)
+        elif params["name"] == Clustering.KMeans.name:
             return self._do_kmeans_clustering(detected_peaks)
         else:
             return detected_peaks
 
 
-    def _do_hdbscan_clustering(self, peakdata):
+    def _do_hdbscan_clustering(self, peakdata, params):
         data = np.delete(peakdata, [3,4], axis=1)
 
-        hdbscan = HDBSCAN(min_cluster_size=2, store_centers="medoid")
+        hdbscan = HDBSCAN(min_cluster_size=2, 
+                          min_samples=2, 
+                          store_centers="medoid", 
+                          algorithm="auto", 
+                          cluster_selection_method="eom", 
+                          metric="euclidean")
         hdbscan.fit(data)
         print(f"Silhouette score of the clusters={silhouette_score(data, hdbscan.labels_)}")
 
+        if ("keep_ignored_labels" in params) and params["keep_ignored_labels"]:
+                selected_peaks = np.concatenate((hdbscan.medoids_, data[np.where(hdbscan.labels_==-1)]), axis=0)
+        else:
+            selected_peaks = hdbscan.medoids_
         confidence = []
-        for medoid in hdbscan.medoids_:
-            confidence.append(peakdata[np.where((data == medoid).all(axis=1))[0].item(), -1])
-        return np.column_stack((hdbscan.medoids_, confidence))
+        for peak in selected_peaks:
+            confidence.append(peakdata[np.where((data == peak).all(axis=1))[0].item(), -1])
+        return np.column_stack((selected_peaks, confidence))
 
 
     def _do_kmeans_clustering(self, peakdata):

diff --git a/diffraction/WISH/bragg-detect/cnn/README.md b/diffraction/WISH/bragg-detect/cnn/README.md
@@ -12,6 +12,7 @@ Inorder to use the pre-trained Faster RCNN model inside mantid using an IDAaaS i
 from cnn.BraggDetectCNN import BraggDetectCNN
 model_weights = r'/mnt/ceph/auxiliary/wish/BraggDetect_FasterRCNN_Resnet50_Weights_v1.pt'
 cnn_peaks_detector = BraggDetectCNN(model_weights_path=model_weights, batch_size=64)
-cnn_peaks_detector.find_bragg_peaks(workspace='WISH00042730', output_ws_name="CNN_Peaks", conf_threshold=0.0, clustering="QLab", q_tol=0.05)
+clustering_params = {"name":"QLab", "q_tol": 0.05}
+cnn_peaks_detector.find_bragg_peaks(workspace='WISH00042730', output_ws_name="CNN_Peaks", conf_threshold=0.0, **clustering_params)
 ```
 * If the above import is not working, check whether the `<local path>\diffraction\WISH` path is listed under `Python Script Directories` tab from `File->Manage User Directories`.