From 249888fc3475e42a3bfe872be0e9a3870b1e6220 Mon Sep 17 00:00:00 2001 From: TarikExner Date: Mon, 15 Apr 2024 20:15:59 +0200 Subject: [PATCH 1/4] fixed reproducibility issue due to missing seed in the SOMEstimator class --- src/FlowSOM/models/som_estimator.py | 5 +++++ tests/models/test_FlowSOMModel.py | 17 +++++++++++++++++ tests/models/test_SOMModel.py | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/src/FlowSOM/models/som_estimator.py b/src/FlowSOM/models/som_estimator.py index e313e68..9ca9724 100644 --- a/src/FlowSOM/models/som_estimator.py +++ b/src/FlowSOM/models/som_estimator.py @@ -1,5 +1,6 @@ import igraph as ig import numpy as np +from numba import jit from scipy.spatial.distance import cdist, pdist, squareform from sklearn.utils.validation import check_is_fitted @@ -74,6 +75,10 @@ def fit( # Initialize the grid grid = [(x, y) for x in range(xdim) for y in range(ydim)] n_codes = len(grid) + + if self.seed is not None: + np.random.seed(self.seed) + if codes is None: if init: codes = self.initf(X, xdim, ydim) diff --git a/tests/models/test_FlowSOMModel.py b/tests/models/test_FlowSOMModel.py index b5be7dc..c9715da 100644 --- a/tests/models/test_FlowSOMModel.py +++ b/tests/models/test_FlowSOMModel.py @@ -15,3 +15,20 @@ def test_clustering_v_measure(X_and_y): y_pred = som.fit_predict(X) score = v_measure_score(y_true, y_pred) assert score > 0.7 + +def test_reproducibility_no_seed(X): + fsom_1 = FlowSOMEstimator(cluster_kwargs={}, metacluster_kwargs={"n_clusters": 10}) + fsom_2 = FlowSOMEstimator(cluster_kwargs={}, metacluster_kwargs={"n_clusters": 10}) + y_pred_1 = fsom_1.fit_predict(X) + y_pred_2 = fsom_2.fit_predict(X) + + assert not all(y_pred_1 == y_pred_2) + +def test_reproducibility_seed(X): + fsom_1 = FlowSOMEstimator(cluster_kwargs={"seed": 0}, metacluster_kwargs={"n_clusters": 10}) + fsom_2 = FlowSOMEstimator(cluster_kwargs={"seed": 0}, metacluster_kwargs={"n_clusters": 10}) + y_pred_1 = fsom_1.fit_predict(X) + y_pred_2 = fsom_2.fit_predict(X) + + assert all(y_pred_1 == y_pred_2) + diff --git a/tests/models/test_SOMModel.py b/tests/models/test_SOMModel.py index 5235b68..f177b7a 100644 --- a/tests/models/test_SOMModel.py +++ b/tests/models/test_SOMModel.py @@ -15,3 +15,23 @@ def test_clustering_v_measure(X_and_y): y_pred = som.fit_predict(X) score = v_measure_score(y_true, y_pred) assert score > 0.7 + + +def test_reproducibility_no_seed(X): + som_1 = SOMEstimator(seed = None) + som_2 = SOMEstimator(seed = None) + codes_1 = som_1.fit(X).codes.flatten() + codes_2 = som_2.fit(X).codes.flatten() + + assert not all(codes_1 == codes_2) + + +def test_reproducibility_seed(X): + som_1 = SOMEstimator(seed = 1) + som_2 = SOMEstimator(seed = 1) + codes_1 = som_1.fit(X).codes.flatten() + codes_2 = som_2.fit(X).codes.flatten() + + assert all(codes_1 == codes_2) + + From 5e20f06749f4aab8f7ad8ed7433881f9dce89e5c Mon Sep 17 00:00:00 2001 From: TarikExner Date: Mon, 15 Apr 2024 20:50:45 +0200 Subject: [PATCH 2/4] removed unnecessary import for jit decorator --- src/FlowSOM/models/som_estimator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/FlowSOM/models/som_estimator.py b/src/FlowSOM/models/som_estimator.py index 9ca9724..e08265e 100644 --- a/src/FlowSOM/models/som_estimator.py +++ b/src/FlowSOM/models/som_estimator.py @@ -1,6 +1,5 @@ import igraph as ig import numpy as np -from numba import jit from scipy.spatial.distance import cdist, pdist, squareform from sklearn.utils.validation import check_is_fitted From 0f3695d718be4f00e7373e940fed1e132d20f952 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 16 Apr 2024 17:18:41 +0200 Subject: [PATCH 3/4] adapt to new parameters --- tests/models/test_FlowSOMModel.py | 11 ++++++----- tests/models/test_SOMModel.py | 12 +++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/models/test_FlowSOMModel.py b/tests/models/test_FlowSOMModel.py index 23569f2..dacee07 100644 --- a/tests/models/test_FlowSOMModel.py +++ b/tests/models/test_FlowSOMModel.py @@ -16,19 +16,20 @@ def test_clustering_v_measure(X_and_y): score = v_measure_score(y_true, y_pred) assert score > 0.7 + def test_reproducibility_no_seed(X): - fsom_1 = FlowSOMEstimator(cluster_kwargs={}, metacluster_kwargs={"n_clusters": 10}) - fsom_2 = FlowSOMEstimator(cluster_kwargs={}, metacluster_kwargs={"n_clusters": 10}) + fsom_1 = FlowSOMEstimator(n_clusters=10) + fsom_2 = FlowSOMEstimator(n_clusters=10) y_pred_1 = fsom_1.fit_predict(X) y_pred_2 = fsom_2.fit_predict(X) assert not all(y_pred_1 == y_pred_2) + def test_reproducibility_seed(X): - fsom_1 = FlowSOMEstimator(cluster_kwargs={"seed": 0}, metacluster_kwargs={"n_clusters": 10}) - fsom_2 = FlowSOMEstimator(cluster_kwargs={"seed": 0}, metacluster_kwargs={"n_clusters": 10}) + fsom_1 = FlowSOMEstimator(n_clusters=10, seed=0) + fsom_2 = FlowSOMEstimator(n_clusters=10, seed=0) y_pred_1 = fsom_1.fit_predict(X) y_pred_2 = fsom_2.fit_predict(X) assert all(y_pred_1 == y_pred_2) - diff --git a/tests/models/test_SOMModel.py b/tests/models/test_SOMModel.py index f177b7a..2e5eb33 100644 --- a/tests/models/test_SOMModel.py +++ b/tests/models/test_SOMModel.py @@ -10,7 +10,7 @@ def test_clustering(X): def test_clustering_v_measure(X_and_y): - som = SOMEstimator() + som = SOMEstimator(seed=1) X, y_true = X_and_y y_pred = som.fit_predict(X) score = v_measure_score(y_true, y_pred) @@ -18,8 +18,8 @@ def test_clustering_v_measure(X_and_y): def test_reproducibility_no_seed(X): - som_1 = SOMEstimator(seed = None) - som_2 = SOMEstimator(seed = None) + som_1 = SOMEstimator(seed=None) + som_2 = SOMEstimator(seed=None) codes_1 = som_1.fit(X).codes.flatten() codes_2 = som_2.fit(X).codes.flatten() @@ -27,11 +27,9 @@ def test_reproducibility_no_seed(X): def test_reproducibility_seed(X): - som_1 = SOMEstimator(seed = 1) - som_2 = SOMEstimator(seed = 1) + som_1 = SOMEstimator(seed=1) + som_2 = SOMEstimator(seed=1) codes_1 = som_1.fit(X).codes.flatten() codes_2 = som_2.fit(X).codes.flatten() assert all(codes_1 == codes_2) - - From 343f228e3df634560b9c49bbd5a325418dce8e5f Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 16 Apr 2024 17:24:02 +0200 Subject: [PATCH 4/4] Change FlowSOM input typing, add seed parameter --- src/FlowSOM/main.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/FlowSOM/main.py b/src/FlowSOM/main.py index 695592d..18c4b3d 100644 --- a/src/FlowSOM/main.py +++ b/src/FlowSOM/main.py @@ -14,6 +14,7 @@ from sklearn.base import check_is_fitted from flowsom.io import read_csv, read_FCS +from flowsom.models.base_flowsom_estimator import BaseFlowSOMEstimator from flowsom.models.flowsom_estimator import FlowSOMEstimator from flowsom.tl import get_channels, get_markers @@ -24,37 +25,31 @@ class FlowSOM: def __init__( self, inp, - n_clusters, - cols_to_use=None, - model=FlowSOMEstimator, - xdim=10, - ydim=10, - rlen=10, - mst=1, - alpha=(0.05, 0.01), + n_clusters: int, + cols_to_use: np.ndarray | None = None, + model: type[BaseFlowSOMEstimator] = FlowSOMEstimator, + xdim: int = 10, + ydim: int = 10, + rlen: int = 10, + mst: int = 1, + alpha: tuple[float, float] = (0.05, 0.01), + seed: int | None = None, mad_allowed=4, **kwargs, ): """Initialize the FlowSOM AnnData object. + :param inp: An AnnData or filepath to an FCS file :param n_clusters: The number of clusters - :type n_clusters: int :param xdim: The x dimension of the SOM - :type xdim: int :param ydim: The y dimension of the SOM - :type ydim: int :param rlen: Number of times to loop over the training data for each MST - :type rlen: int :param mst: Number of times to loop over the training data for each MST - :type mst: int :param alpha: The learning rate - :type alpha: tuple + :param seed: The random seed to use :param cols_to_use: The columns to use for clustering - :type cols_to_use: np.array :param mad_allowed: Number of median absolute deviations allowed - :type mad_allowed: int :param model: The model to use - :type model: FlowSOMEstimator :param kwargs: Additional keyword arguments. See documentation of the cluster_model and metacluster_model for more information. :type kwargs: dict """ @@ -66,6 +61,7 @@ def __init__( self.rlen = rlen self.mst = mst self.alpha = alpha + self.seed = seed # metacluster model params self.n_clusters = n_clusters @@ -75,6 +71,7 @@ def __init__( rlen=rlen, mst=mst, alpha=alpha, + seed=seed, n_clusters=n_clusters, **kwargs, )