From cc46520784375ddfb8ba84cc7249402117374e60 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Tue, 3 Sep 2024 09:27:48 +0530 Subject: [PATCH 01/10] added onehot encoder to handle unknown categorical values (which pd get dummies fail) --- prince/mca.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/prince/mca.py b/prince/mca.py index 67924c3c..8c7b87e9 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -5,6 +5,7 @@ import pandas as pd import sklearn.base import sklearn.utils +from sklearn.preprocessing import OneHotEncoder from prince import utils @@ -20,7 +21,9 @@ def __init__( check_input=True, random_state=None, engine="sklearn", - one_hot=True, + one_hot = False, #if True, use pd.get_dummies to one-hot encode the data + one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use + is_one_hot_fitted = False ): super().__init__( n_components=n_components, @@ -31,10 +34,21 @@ def __init__( engine=engine, ) self.one_hot = one_hot + self.one_hot_encoder = one_hot_encoder + self.is_one_hot_fitted = is_one_hot_fitted + def _prepare(self, X): if self.one_hot: X = pd.get_dummies(X, columns=X.columns) + else: + if self.is_one_hot_fitted == False: + X = self.one_hot_encoder.fit_transform(X) + X = pd.DataFrame(X) + self.is_one_hot_fitted = True + else: + X = self.one_hot_encoder.transform(X) + X = pd.DataFrame(X) return X @utils.check_is_dataframe_input @@ -54,6 +68,7 @@ def fit(self, X, y=None): self.K_ = X.shape[1] # One-hot encode the data + print(X.shape) one_hot = self._prepare(X) # We need the number of columns to apply the Greenacre correction From 09d92d24be3cef5495f70661ea4acdf30e39581b Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Tue, 3 Sep 2024 20:45:42 +0530 Subject: [PATCH 02/10] modified code to support one_hot attribute and original get_dummies method | added description --- prince/mca.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/prince/mca.py b/prince/mca.py index 8c7b87e9..84172128 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -13,6 +13,14 @@ class MCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, ca.CA): + ''' + added new attributes to support one-hot encoding when handling unknown categories + + added attributes: + get_dummies: if True, use pd.get_dummies to one-hot encode the data + one_hot_encoder: OneHotEncoder object to use + is_one_hot_fitted: check if one_hot_encoder is fitted (set it to true if the one_hot_encoder is already fitted) + ''' def __init__( self, n_components=2, @@ -21,7 +29,8 @@ def __init__( check_input=True, random_state=None, engine="sklearn", - one_hot = False, #if True, use pd.get_dummies to one-hot encode the data + one_hot = True, + get_dummies = False,#if True, use pd.get_dummies to one-hot encode the data one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use is_one_hot_fitted = False ): @@ -34,21 +43,23 @@ def __init__( engine=engine, ) self.one_hot = one_hot + self.get_dummies = get_dummies self.one_hot_encoder = one_hot_encoder self.is_one_hot_fitted = is_one_hot_fitted def _prepare(self, X): if self.one_hot: - X = pd.get_dummies(X, columns=X.columns) - else: - if self.is_one_hot_fitted == False: - X = self.one_hot_encoder.fit_transform(X) - X = pd.DataFrame(X) - self.is_one_hot_fitted = True + if self.get_dummies: + X = pd.get_dummies(X, columns=X.columns) else: - X = self.one_hot_encoder.transform(X) - X = pd.DataFrame(X) + if self.is_one_hot_fitted == False: + X = self.one_hot_encoder.fit_transform(X) + X = pd.DataFrame(X) + self.is_one_hot_fitted = True + else: + X = self.one_hot_encoder.transform(X) + X = pd.DataFrame(X) return X @utils.check_is_dataframe_input From 37e0f594a48da151231db7da8f0ee831e8184457 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Tue, 3 Sep 2024 21:02:05 +0530 Subject: [PATCH 03/10] fixed issue to get column names after using OneHotEncoder --- prince/mca.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/prince/mca.py b/prince/mca.py index 84172128..abf85c1e 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -52,14 +52,16 @@ def _prepare(self, X): if self.one_hot: if self.get_dummies: X = pd.get_dummies(X, columns=X.columns) + return X else: if self.is_one_hot_fitted == False: - X = self.one_hot_encoder.fit_transform(X) - X = pd.DataFrame(X) + X_enc = self.one_hot_encoder.fit_transform(X) + X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) self.is_one_hot_fitted = True + return X_enc else: - X = self.one_hot_encoder.transform(X) - X = pd.DataFrame(X) + X_enc = self.one_hot_encoder.transform(X) + X = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) return X @utils.check_is_dataframe_input From 77e0603670e3488e091f885816fe63a9a72075e3 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Tue, 3 Sep 2024 21:05:09 +0530 Subject: [PATCH 04/10] small issue in _prepare (didn't return the one-hot encoded values if the one_hot_encoder is fitted) --- prince/mca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prince/mca.py b/prince/mca.py index abf85c1e..84466662 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -61,7 +61,8 @@ def _prepare(self, X): return X_enc else: X_enc = self.one_hot_encoder.transform(X) - X = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) + X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) + return X_enc return X @utils.check_is_dataframe_input From 916607174b423c7cbde6422d69d5c70a542c8641 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Tue, 3 Sep 2024 21:34:57 +0530 Subject: [PATCH 05/10] updated the mca notebook in docs/content --- docs/content/mca.ipynb | 233 ++++++++++++++++++++++++----------------- 1 file changed, 137 insertions(+), 96 deletions(-) diff --git a/docs/content/mca.ipynb b/docs/content/mca.ipynb index ab86ae23..b9260124 100644 --- a/docs/content/mca.ipynb +++ b/docs/content/mca.ipynb @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:02.663561Z", @@ -168,7 +168,15 @@ "shell.execute_reply": "2023-10-11T22:33:03.033990Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(19, 5)\n" + ] + } + ], "source": [ "import prince\n", "\n", @@ -178,7 +186,9 @@ " copy=True,\n", " check_input=True,\n", " engine='sklearn',\n", - " random_state=42\n", + " random_state=42,\n", + " one_hot=True,\n", + " get_dummies=True\n", ")\n", "mca = mca.fit(dataset)" ] @@ -192,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.039813Z", @@ -201,7 +211,15 @@ "shell.execute_reply": "2023-10-11T22:33:03.066635Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(19, 10)\n" + ] + } + ], "source": [ "one_hot = pd.get_dummies(dataset)\n", "\n", @@ -219,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.080736Z", @@ -292,7 +310,7 @@ "2 0.186 18.56% 79.84%" ] }, - "execution_count": 4, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -311,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.111432Z", @@ -351,25 +369,25 @@ " \n", " 0\n", " 0.705387\n", - " 8.460396e-15\n", + " 5.369158e-15\n", " 0.758639\n", " \n", " \n", " 1\n", " -0.386586\n", - " 8.514287e-15\n", + " 5.724889e-15\n", " 0.626063\n", " \n", " \n", " 2\n", " -0.386586\n", - " 6.249235e-15\n", + " 4.807799e-15\n", " 0.626063\n", " \n", " \n", " 3\n", " -0.852014\n", - " 6.872889e-15\n", + " 5.108782e-15\n", " 0.562447\n", " \n", " \n", @@ -384,14 +402,14 @@ ], "text/plain": [ " 0 1 2\n", - "0 0.705387 8.460396e-15 0.758639\n", - "1 -0.386586 8.514287e-15 0.626063\n", - "2 -0.386586 6.249235e-15 0.626063\n", - "3 -0.852014 6.872889e-15 0.562447\n", + "0 0.705387 5.369158e-15 0.758639\n", + "1 -0.386586 5.724889e-15 0.626063\n", + "2 -0.386586 4.807799e-15 0.626063\n", + "3 -0.852014 5.108782e-15 0.562447\n", "4 0.783539 -6.333333e-01 0.130201" ] }, - "execution_count": 5, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.136728Z", @@ -466,7 +484,7 @@ " \n", " Action_DIP\n", " -0.853864\n", - " -1.953058e-15\n", + " -6.209900e-16\n", " -0.079340\n", " \n", " \n", @@ -479,10 +497,10 @@ "Color_YELLOW -0.130342 -7.657805e-01 0.712523\n", "Size_LARGE 0.117308 -6.892024e-01 -0.641270\n", "Size_SMALL -0.130342 7.657805e-01 0.712523\n", - "Action_DIP -0.853864 -1.953058e-15 -0.079340" + "Action_DIP -0.853864 -6.209900e-16 -0.079340" ] }, - "execution_count": 6, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -501,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.165704Z", @@ -511,17 +529,33 @@ } }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", + "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", + "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", + "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", + "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n" + ] + }, { "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 7, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -601,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.264339Z", @@ -616,54 +650,54 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 012012
07%0%16%07%0%16%
12%0%11%12%0%11%
22%0%11%22%0%11%
310%0%9%310%0%9%
48%10%0%48%10%0%
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 8, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -674,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.464586Z", @@ -689,54 +723,54 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 012012
Color_PURPLE0%24%23%Color_PURPLE0%24%23%
Color_YELLOW0%26%26%Color_YELLOW0%26%26%
Size_LARGE0%24%23%Size_LARGE0%24%23%
Size_SMALL0%26%26%Size_SMALL0%26%26%
Action_DIP15%0%0%Action_DIP15%0%0%
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -755,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.478001Z", @@ -795,25 +829,25 @@ " \n", " 0\n", " 0.461478\n", - " 6.638620e-29\n", + " 2.673675e-29\n", " 0.533786\n", " \n", " \n", " 1\n", " 0.152256\n", - " 7.385455e-29\n", + " 3.338988e-29\n", " 0.399316\n", " \n", " \n", " 2\n", " 0.152256\n", - " 3.978637e-29\n", + " 2.354904e-29\n", " 0.399316\n", " \n", " \n", " 3\n", " 0.653335\n", - " 4.251294e-29\n", + " 2.348969e-29\n", " 0.284712\n", " \n", " \n", @@ -828,14 +862,14 @@ ], "text/plain": [ " 0 1 2\n", - "0 0.461478 6.638620e-29 0.533786\n", - "1 0.152256 7.385455e-29 0.399316\n", - "2 0.152256 3.978637e-29 0.399316\n", - "3 0.653335 4.251294e-29 0.284712\n", + "0 0.461478 2.673675e-29 0.533786\n", + "1 0.152256 3.338988e-29 0.399316\n", + "2 0.152256 2.354904e-29 0.399316\n", + "3 0.653335 2.348969e-29 0.284712\n", "4 0.592606 3.871772e-01 0.016363" ] }, - "execution_count": 10, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -846,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2023-10-11T22:33:03.494732Z", @@ -910,7 +944,7 @@ " \n", " Action_DIP\n", " 0.530243\n", - " 2.774134e-30\n", + " 2.804572e-31\n", " 0.004578\n", " \n", " \n", @@ -923,10 +957,10 @@ "Color_YELLOW 0.015290 5.277778e-01 0.456920\n", "Size_LARGE 0.015290 5.277778e-01 0.456920\n", "Size_SMALL 0.015290 5.277778e-01 0.456920\n", - "Action_DIP 0.530243 2.774134e-30 0.004578" + "Action_DIP 0.530243 2.804572e-31 0.004578" ] }, - "execution_count": 11, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -934,6 +968,13 @@ "source": [ "mca.column_cosine_similarities(dataset).head()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -952,7 +993,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.14" }, "vscode": { "interpreter": { From f87c84326829268319616b5c68a77062d8b84453 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Sun, 8 Sep 2024 13:04:38 +0530 Subject: [PATCH 06/10] fixed an issue to handle unknown columns during one hot encoding for MCA analysis --- prince/mca.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/prince/mca.py b/prince/mca.py index 84466662..7abe9b76 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -31,7 +31,7 @@ def __init__( engine="sklearn", one_hot = True, get_dummies = False,#if True, use pd.get_dummies to one-hot encode the data - one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use + one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=bool), #OneHotEncoder object to use is_one_hot_fitted = False ): super().__init__( @@ -55,14 +55,28 @@ def _prepare(self, X): return X else: if self.is_one_hot_fitted == False: + #if the one_hot_encoder is not fitted, to fit and also set the is_one_hot_fitted variable to True X_enc = self.one_hot_encoder.fit_transform(X) X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) self.is_one_hot_fitted = True return X_enc else: - X_enc = self.one_hot_encoder.transform(X) - X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) - return X_enc + #checking if the columns fed to the onehot encoder and the columns fitted to the onehot encoder are the same + oh_cols = set(self.one_hot_encoder.feature_names_in_.tolist()) + X_cols = set(X.columns.tolist()) + + if oh_cols == X_cols: + #if the fitted cols are the same as the inferencing columns, then can transform + X_enc = self.one_hot_encoder.transform(X) + X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) + return X_enc + else: + #if the fitted cols are different to the inferencing columns, then should fit the onehot encoder again, to handle unit tests + print(X_cols) + print(oh_cols) + X_enc = self.one_hot_encoder.fit_transform(X) + X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) + return X_enc return X @utils.check_is_dataframe_input From e63d74b162a42b3f2c44ca2903766fa992426ab7 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Sun, 8 Sep 2024 13:19:02 +0530 Subject: [PATCH 07/10] fixing merge issue in mca notebook in docs --- docs/content/mca.ipynb | 83 +++++++++++------------------------------- 1 file changed, 21 insertions(+), 62 deletions(-) diff --git a/docs/content/mca.ipynb b/docs/content/mca.ipynb index 9bb2de0a..4bb49112 100644 --- a/docs/content/mca.ipynb +++ b/docs/content/mca.ipynb @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:04.556009Z", @@ -168,15 +168,7 @@ "shell.execute_reply": "2024-09-07T18:18:05.003821Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(19, 5)\n" - ] - } - ], + "outputs": [], "source": [ "import prince\n", "\n", @@ -186,9 +178,7 @@ " copy=True,\n", " check_input=True,\n", " engine='sklearn',\n", - " random_state=42,\n", - " one_hot=True,\n", - " get_dummies=True\n", + " random_state=42\n", ")\n", "mca = mca.fit(dataset)" ] @@ -202,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.011574Z", @@ -211,15 +201,7 @@ "shell.execute_reply": "2024-09-07T18:18:05.036730Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(19, 10)\n" - ] - } - ], + "outputs": [], "source": [ "one_hot = pd.get_dummies(dataset)\n", "\n", @@ -237,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.042434Z", @@ -310,7 +292,7 @@ "2 0.186 18.56% 79.84%" ] }, - "execution_count": 15, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -329,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.067678Z", @@ -409,7 +391,7 @@ "4 0.783539 -6.333333e-01 0.130201" ] }, - "execution_count": 16, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -420,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.095132Z", @@ -500,7 +482,7 @@ "Action_DIP -0.853864 -2.712409e-15 -0.079340" ] }, - "execution_count": 17, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -519,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.127742Z", @@ -529,22 +511,6 @@ } }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", - " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", - "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", - " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", - "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", - " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", - "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", - " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", - "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", - " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n" - ] - }, { "data": { "text/html": [ @@ -608,7 +574,7 @@ "alt.LayerChart(...)" ] }, - "execution_count": 18, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -635,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.215554Z", @@ -697,7 +663,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -708,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.251107Z", @@ -770,7 +736,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -789,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.274947Z", @@ -869,7 +835,7 @@ "4 0.592606 3.871772e-01 0.016363" ] }, - "execution_count": 21, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -880,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-09-07T18:18:05.304890Z", @@ -960,7 +926,7 @@ "Action_DIP 0.530243 5.350665e-30 0.004578" ] }, - "execution_count": 22, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -968,13 +934,6 @@ "source": [ "mca.column_cosine_similarities(dataset).head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 3bcff9c2b5897492a39f53d38572dda8ef5959ef Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Sun, 8 Sep 2024 13:25:52 +0530 Subject: [PATCH 08/10] removed code lines kept for debugging --- prince/mca.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/prince/mca.py b/prince/mca.py index 7abe9b76..f506328a 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -72,8 +72,6 @@ def _prepare(self, X): return X_enc else: #if the fitted cols are different to the inferencing columns, then should fit the onehot encoder again, to handle unit tests - print(X_cols) - print(oh_cols) X_enc = self.one_hot_encoder.fit_transform(X) X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns)) return X_enc From bfc91790bb84dc2aa54d487014831d33bf334ec0 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Sun, 8 Sep 2024 20:35:17 +0530 Subject: [PATCH 09/10] 2 errors caused by print code for logging --- prince/mca.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prince/mca.py b/prince/mca.py index f506328a..8d7b8492 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -94,7 +94,6 @@ def fit(self, X, y=None): self.K_ = X.shape[1] # One-hot encode the data - print(X.shape) one_hot = self._prepare(X) # We need the number of columns to apply the Greenacre correction From ef68f6b77d5eb791d0113bd956d3625093899317 Mon Sep 17 00:00:00 2001 From: Vaseekaran Varatharajah Date: Sun, 8 Sep 2024 21:28:49 +0530 Subject: [PATCH 10/10] fixed a clean code issue --- prince/mca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prince/mca.py b/prince/mca.py index 8d7b8492..3eef94a0 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -54,7 +54,7 @@ def _prepare(self, X): X = pd.get_dummies(X, columns=X.columns) return X else: - if self.is_one_hot_fitted == False: + if self.is_one_hot_fitted is False: #if the one_hot_encoder is not fitted, to fit and also set the is_one_hot_fitted variable to True X_enc = self.one_hot_encoder.fit_transform(X) X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))