From cc46520784375ddfb8ba84cc7249402117374e60 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Tue, 3 Sep 2024 09:27:48 +0530
Subject: [PATCH 01/10] Add OneHotEncoder to handle unknown categorical
 values (which pd.get_dummies fails on)

---
 prince/mca.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/prince/mca.py b/prince/mca.py
index 67924c3c..8c7b87e9 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import sklearn.base
 import sklearn.utils
+from sklearn.preprocessing import OneHotEncoder
 
 from prince import utils
 
@@ -20,7 +21,9 @@ def __init__(
         check_input=True,
         random_state=None,
         engine="sklearn",
-        one_hot=True,
+        one_hot = False, #if True, use pd.get_dummies to one-hot encode the data
+        one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use
+        is_one_hot_fitted = False
     ):
         super().__init__(
             n_components=n_components,
@@ -31,10 +34,21 @@ def __init__(
             engine=engine,
         )
         self.one_hot = one_hot
+        self.one_hot_encoder = one_hot_encoder
+        self.is_one_hot_fitted = is_one_hot_fitted
+
 
     def _prepare(self, X):
         if self.one_hot:
             X = pd.get_dummies(X, columns=X.columns)
+        else:
+            if self.is_one_hot_fitted == False:
+                X = self.one_hot_encoder.fit_transform(X)
+                X = pd.DataFrame(X)
+                self.is_one_hot_fitted = True
+            else:
+                X = self.one_hot_encoder.transform(X)
+                X = pd.DataFrame(X)
         return X
 
     @utils.check_is_dataframe_input
@@ -54,6 +68,7 @@ def fit(self, X, y=None):
         self.K_ = X.shape[1]
 
         # One-hot encode the data
+        print(X.shape)
         one_hot = self._prepare(X)
 
         # We need the number of columns to apply the Greenacre correction

From 09d92d24be3cef5495f70661ea4acdf30e39581b Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Tue, 3 Sep 2024 20:45:42 +0530
Subject: [PATCH 02/10] modified code to support one_hot attribute and original
 get_dummies method | added description

---
 prince/mca.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/prince/mca.py b/prince/mca.py
index 8c7b87e9..84172128 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -13,6 +13,14 @@
 
 
 class MCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin, ca.CA):
+    '''
+    added new attributes to support one-hot encoding when handling unknown categories
+    
+    added attributes:
+        get_dummies: if True, use pd.get_dummies to one-hot encode the data
+        one_hot_encoder: OneHotEncoder object to use
+        is_one_hot_fitted: check if one_hot_encoder is fitted (set it to true if the one_hot_encoder is already fitted)
+    '''
     def __init__(
         self,
         n_components=2,
@@ -21,7 +29,8 @@ def __init__(
         check_input=True,
         random_state=None,
         engine="sklearn",
-        one_hot = False, #if True, use pd.get_dummies to one-hot encode the data
+        one_hot = True,
+        get_dummies = False,#if True, use pd.get_dummies to one-hot encode the data
         one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use
         is_one_hot_fitted = False
     ):
@@ -34,21 +43,23 @@ def __init__(
             engine=engine,
         )
         self.one_hot = one_hot
+        self.get_dummies = get_dummies
         self.one_hot_encoder = one_hot_encoder
         self.is_one_hot_fitted = is_one_hot_fitted
 
 
     def _prepare(self, X):
         if self.one_hot:
-            X = pd.get_dummies(X, columns=X.columns)
-        else:
-            if self.is_one_hot_fitted == False:
-                X = self.one_hot_encoder.fit_transform(X)
-                X = pd.DataFrame(X)
-                self.is_one_hot_fitted = True
+            if self.get_dummies:
+                X = pd.get_dummies(X, columns=X.columns)
             else:
-                X = self.one_hot_encoder.transform(X)
-                X = pd.DataFrame(X)
+                if self.is_one_hot_fitted == False:
+                    X = self.one_hot_encoder.fit_transform(X)
+                    X = pd.DataFrame(X)
+                    self.is_one_hot_fitted = True
+                else:
+                    X = self.one_hot_encoder.transform(X)
+                    X = pd.DataFrame(X)
         return X
 
     @utils.check_is_dataframe_input

From 37e0f594a48da151231db7da8f0ee831e8184457 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Tue, 3 Sep 2024 21:02:05 +0530
Subject: [PATCH 03/10] fixed issue to get column names after using
 OneHotEncoder

---
 prince/mca.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/prince/mca.py b/prince/mca.py
index 84172128..abf85c1e 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -52,14 +52,16 @@ def _prepare(self, X):
         if self.one_hot:
             if self.get_dummies:
                 X = pd.get_dummies(X, columns=X.columns)
+                return X
             else:
                 if self.is_one_hot_fitted == False:
-                    X = self.one_hot_encoder.fit_transform(X)
-                    X = pd.DataFrame(X)
+                    X_enc = self.one_hot_encoder.fit_transform(X)
+                    X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
                     self.is_one_hot_fitted = True
+                    return X_enc
                 else:
-                    X = self.one_hot_encoder.transform(X)
-                    X = pd.DataFrame(X)
+                    X_enc = self.one_hot_encoder.transform(X)
+                    X = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
         return X
 
     @utils.check_is_dataframe_input

From 77e0603670e3488e091f885816fe63a9a72075e3 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Tue, 3 Sep 2024 21:05:09 +0530
Subject: [PATCH 04/10] Fix _prepare not returning the one-hot encoded
 values when the one_hot_encoder is already fitted

---
 prince/mca.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/prince/mca.py b/prince/mca.py
index abf85c1e..84466662 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -61,7 +61,8 @@ def _prepare(self, X):
                     return X_enc
                 else:
                     X_enc = self.one_hot_encoder.transform(X)
-                    X = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
+                    X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
+                    return X_enc
         return X
 
     @utils.check_is_dataframe_input

From 916607174b423c7cbde6422d69d5c70a542c8641 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Tue, 3 Sep 2024 21:34:57 +0530
Subject: [PATCH 05/10] updated the mca notebook in docs/content

---
 docs/content/mca.ipynb | 233 ++++++++++++++++++++++++-----------------
 1 file changed, 137 insertions(+), 96 deletions(-)

diff --git a/docs/content/mca.ipynb b/docs/content/mca.ipynb
index ab86ae23..b9260124 100644
--- a/docs/content/mca.ipynb
+++ b/docs/content/mca.ipynb
@@ -159,7 +159,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:02.663561Z",
@@ -168,7 +168,15 @@
      "shell.execute_reply": "2023-10-11T22:33:03.033990Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(19, 5)\n"
+     ]
+    }
+   ],
    "source": [
     "import prince\n",
     "\n",
@@ -178,7 +186,9 @@
     "    copy=True,\n",
     "    check_input=True,\n",
     "    engine='sklearn',\n",
-    "    random_state=42\n",
+    "    random_state=42,\n",
+    "    one_hot=True,\n",
+    "    get_dummies=True\n",
     ")\n",
     "mca = mca.fit(dataset)"
    ]
@@ -192,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.039813Z",
@@ -201,7 +211,15 @@
      "shell.execute_reply": "2023-10-11T22:33:03.066635Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(19, 10)\n"
+     ]
+    }
+   ],
    "source": [
     "one_hot = pd.get_dummies(dataset)\n",
     "\n",
@@ -219,7 +237,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 15,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.080736Z",
@@ -292,7 +310,7 @@
        "2              0.186        18.56%                     79.84%"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -311,7 +329,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 16,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.111432Z",
@@ -351,25 +369,25 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>0.705387</td>\n",
-       "      <td>8.460396e-15</td>\n",
+       "      <td>5.369158e-15</td>\n",
        "      <td>0.758639</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>-0.386586</td>\n",
-       "      <td>8.514287e-15</td>\n",
+       "      <td>5.724889e-15</td>\n",
        "      <td>0.626063</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>-0.386586</td>\n",
-       "      <td>6.249235e-15</td>\n",
+       "      <td>4.807799e-15</td>\n",
        "      <td>0.626063</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>-0.852014</td>\n",
-       "      <td>6.872889e-15</td>\n",
+       "      <td>5.108782e-15</td>\n",
        "      <td>0.562447</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -384,14 +402,14 @@
       ],
       "text/plain": [
        "          0             1         2\n",
-       "0  0.705387  8.460396e-15  0.758639\n",
-       "1 -0.386586  8.514287e-15  0.626063\n",
-       "2 -0.386586  6.249235e-15  0.626063\n",
-       "3 -0.852014  6.872889e-15  0.562447\n",
+       "0  0.705387  5.369158e-15  0.758639\n",
+       "1 -0.386586  5.724889e-15  0.626063\n",
+       "2 -0.386586  4.807799e-15  0.626063\n",
+       "3 -0.852014  5.108782e-15  0.562447\n",
        "4  0.783539 -6.333333e-01  0.130201"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -402,7 +420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.136728Z",
@@ -466,7 +484,7 @@
        "    <tr>\n",
        "      <th>Action_DIP</th>\n",
        "      <td>-0.853864</td>\n",
-       "      <td>-1.953058e-15</td>\n",
+       "      <td>-6.209900e-16</td>\n",
        "      <td>-0.079340</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -479,10 +497,10 @@
        "Color_YELLOW -0.130342 -7.657805e-01  0.712523\n",
        "Size_LARGE    0.117308 -6.892024e-01 -0.641270\n",
        "Size_SMALL   -0.130342  7.657805e-01  0.712523\n",
-       "Action_DIP   -0.853864 -1.953058e-15 -0.079340"
+       "Action_DIP   -0.853864 -6.209900e-16 -0.079340"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -501,7 +519,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 18,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.165704Z",
@@ -511,17 +529,33 @@
     }
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
+      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
+      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
+      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
+      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
        "\n",
-       "<div id=\"altair-viz-561283d185444e1ca6854352561fc5c1\"></div>\n",
+       "<div id=\"altair-viz-23aa94dca9904c75b7465910e120ca1f\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-561283d185444e1ca6854352561fc5c1\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-561283d185444e1ca6854352561fc5c1\");\n",
+       "    if (outputDiv.id !== \"altair-viz-23aa94dca9904c75b7465910e120ca1f\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-23aa94dca9904c75b7465910e120ca1f\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
@@ -567,14 +601,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"data\": {\"name\": \"data-b96b35e13a412719b5c57ab7724e48b4\"}, \"mark\": {\"type\": \"circle\", \"size\": 50}, \"encoding\": {\"color\": {\"field\": \"variable\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"variable\", \"type\": \"nominal\"}, {\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"component 0\", \"type\": \"quantitative\"}, {\"field\": \"component 1\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"title\": \"component 0 \\u2014 40.17%\"}, \"field\": \"component 0\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"component 1 \\u2014 21.11%\"}, \"field\": \"component 1\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}, \"selection\": {\"selector001\": {\"type\": \"interval\", \"bind\": \"scales\", \"encodings\": [\"x\", \"y\"]}}}, {\"data\": {\"name\": \"data-d21372c86e9221ca8b3f1ee9ece403fe\"}, \"mark\": {\"type\": \"circle\", \"size\": 50}, \"encoding\": {\"color\": {\"field\": \"variable\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"variable\", \"type\": \"nominal\"}, {\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"component 0\", \"type\": \"quantitative\"}, {\"field\": \"component 1\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"title\": \"component 0 \\u2014 40.17%\"}, \"field\": \"component 0\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"component 1 \\u2014 21.11%\"}, \"field\": \"component 1\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-b96b35e13a412719b5c57ab7724e48b4\": [{\"component 0\": 0.7053867996248326, \"component 1\": 8.460396430365158e-15, \"component 2\": 0.7586391105690962, \"variable\": \"row\", \"value\": \"0\", \"label\": 0}, {\"component 0\": 
-0.38658629949599044, \"component 1\": 8.514287191452671e-15, \"component 2\": 0.6260630816840032, \"variable\": \"row\", \"value\": \"1\", \"label\": 1}, {\"component 0\": -0.38658629949599044, \"component 1\": 6.249235059629217e-15, \"component 2\": 0.6260630816840009, \"variable\": \"row\", \"value\": \"2\", \"label\": 2}, {\"component 0\": -0.8520140574664055, \"component 1\": 6.872888737843351e-15, \"component 2\": 0.5624474892356494, \"variable\": \"row\", \"value\": \"3\", \"label\": 3}, {\"component 0\": 0.7835387510478181, \"component 1\": -0.6333333333333322, \"component 2\": 0.13020069134918916, \"variable\": \"row\", \"value\": \"4\", \"label\": 4}, {\"component 0\": 0.7835387510478181, \"component 1\": -0.6333333333333322, \"component 2\": 0.13020069134918916, \"variable\": \"row\", \"value\": \"5\", \"label\": 5}, {\"component 0\": -0.30843434807300507, \"component 1\": -0.6333333333333322, \"component 2\": -0.00237533753590394, \"variable\": \"row\", \"value\": \"6\", \"label\": 6}, {\"component 0\": -0.308434348073005, \"component 1\": -0.6333333333333344, \"component 2\": -0.0023753375359061813, \"variable\": \"row\", \"value\": \"7\", \"label\": 7}, {\"component 0\": -0.7738621060434201, \"component 1\": -0.6333333333333339, \"component 2\": -0.06599092998425776, \"variable\": \"row\", \"value\": \"8\", \"label\": 8}, {\"component 0\": 0.7835387510478186, \"component 1\": 0.6333333333333346, \"component 2\": 0.1302006913491744, \"variable\": \"row\", \"value\": \"9\", \"label\": 9}, {\"component 0\": 0.7835387510478186, \"component 1\": 0.6333333333333346, \"component 2\": 0.1302006913491744, \"variable\": \"row\", \"value\": \"10\", \"label\": 10}, {\"component 0\": -0.30843434807300457, \"component 1\": 0.6333333333333346, \"component 2\": -0.002375337535918699, \"variable\": \"row\", \"value\": \"11\", \"label\": 11}, {\"component 0\": -0.30843434807300457, \"component 1\": 0.6333333333333324, \"component 2\": -0.0023753375359209404, 
\"variable\": \"row\", \"value\": \"12\", \"label\": 12}, {\"component 0\": -0.7738621060434197, \"component 1\": 0.633333333333333, \"component 2\": -0.06599092998427251, \"variable\": \"row\", \"value\": \"13\", \"label\": 13}, {\"component 0\": 0.8616907024708039, \"component 1\": -6.024166031086946e-15, \"component 2\": -0.49823772787073267, \"variable\": \"row\", \"value\": \"14\", \"label\": 14}, {\"component 0\": 0.8616907024708039, \"component 1\": -6.024166031086946e-15, \"component 2\": -0.49823772787073267, \"variable\": \"row\", \"value\": \"15\", \"label\": 15}, {\"component 0\": -0.2302823966500192, \"component 1\": -5.9702752699994325e-15, \"component 2\": -0.6308137567558257, \"variable\": \"row\", \"value\": \"16\", \"label\": 16}, {\"component 0\": -0.23028239665001915, \"component 1\": -8.235327401822889e-15, \"component 2\": -0.630813756755828, \"variable\": \"row\", \"value\": \"17\", \"label\": 17}, {\"component 0\": -0.6957101546204342, \"component 1\": -7.611673723608755e-15, \"component 2\": -0.6944293492041795, \"variable\": \"row\", \"value\": \"18\", \"label\": 18}], \"data-d21372c86e9221ca8b3f1ee9ece403fe\": [{\"component 0\": 0.1173076067719154, \"component 1\": 0.689202437604504, \"component 2\": -0.6412704755837075, \"variable\": \"column\", \"value\": \"Color_PURPLE\", \"label\": \"Color_PURPLE\"}, {\"component 0\": -0.13034178530212806, \"component 1\": -0.7657804862272266, \"component 2\": 0.7125227506485641, \"variable\": \"column\", \"value\": \"Color_YELLOW\", \"label\": \"Color_YELLOW\"}, {\"component 0\": 0.11730760677191474, \"component 1\": -0.6892024376045184, \"component 2\": -0.6412704755836931, \"variable\": \"column\", \"value\": \"Size_LARGE\", \"label\": \"Size_LARGE\"}, {\"component 0\": -0.13034178530212734, \"component 1\": 0.7657804862272426, \"component 2\": 0.712522750648548, \"variable\": \"column\", \"value\": \"Size_SMALL\", \"label\": \"Size_SMALL\"}, {\"component 0\": -0.8538641988881547, \"component 1\": 
-1.95305767206435e-15, \"component 2\": -0.07934001340795543, \"variable\": \"column\", \"value\": \"Action_DIP\", \"label\": \"Action_DIP\"}, {\"component 0\": 0.6209921446459308, \"component 1\": 1.2784516057017628e-15, \"component 2\": 0.05770182793305871, \"variable\": \"column\", \"value\": \"Action_STRETCH\", \"label\": \"Action_STRETCH\"}, {\"component 0\": 0.6209921446459307, \"component 1\": -6.705540995554791e-16, \"component 2\": 0.057701827933056817, \"variable\": \"column\", \"value\": \"Age_ADULT\", \"label\": \"Age_ADULT\"}, {\"component 0\": -0.8538641988881545, \"component 1\": 7.042413232804728e-16, \"component 2\": -0.07934001340795291, \"variable\": \"column\", \"value\": \"Age_CHILD\", \"label\": \"Age_CHILD\"}, {\"component 0\": -0.7314664035372921, \"component 1\": -4.076017304527338e-16, \"component 2\": -0.054731083980793675, \"variable\": \"column\", \"value\": \"Inflated_F\", \"label\": \"Inflated_F\"}, {\"component 0\": 1.2539424060639297, \"component 1\": 3.9152862759667877e-16, \"component 2\": 0.09382471539564664, \"variable\": \"column\", \"value\": \"Inflated_T\", \"label\": \"Inflated_T\"}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"data\": {\"name\": \"data-77fcb8e636b3df57514d60e150c9d9f1\"}, \"mark\": {\"type\": \"circle\", \"size\": 50}, \"encoding\": {\"color\": {\"field\": \"variable\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"variable\", \"type\": \"nominal\"}, {\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"component 0\", \"type\": \"quantitative\"}, {\"field\": \"component 1\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"title\": \"component 0 \\u2014 40.17%\"}, \"field\": \"component 0\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"component 1 \\u2014 21.11%\"}, \"field\": \"component 1\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}, \"selection\": {\"selector002\": {\"type\": \"interval\", \"bind\": \"scales\", \"encodings\": [\"x\", \"y\"]}}}, {\"data\": {\"name\": \"data-711147a303e3ceea7e2371bc22d80b7a\"}, \"mark\": {\"type\": \"circle\", \"size\": 50}, \"encoding\": {\"color\": {\"field\": \"variable\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"variable\", \"type\": \"nominal\"}, {\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"component 0\", \"type\": \"quantitative\"}, {\"field\": \"component 1\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"title\": \"component 0 \\u2014 40.17%\"}, \"field\": \"component 0\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"component 1 \\u2014 21.11%\"}, \"field\": \"component 1\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-77fcb8e636b3df57514d60e150c9d9f1\": [{\"component 0\": 0.705386799624833, \"component 1\": 5.369157675281349e-15, \"component 2\": 0.7586391105690958, \"variable\": \"row\", \"value\": \"0\", \"label\": 0}, {\"component 0\": -0.3865862994959901, 
\"component 1\": 5.724889175530574e-15, \"component 2\": 0.626063081684002, \"variable\": \"row\", \"value\": \"1\", \"label\": 1}, {\"component 0\": -0.38658629949599027, \"component 1\": 4.807799212409681e-15, \"component 2\": 0.6260630816840012, \"variable\": \"row\", \"value\": \"2\", \"label\": 2}, {\"component 0\": -0.8520140574664052, \"component 1\": 5.1087824590383175e-15, \"component 2\": 0.5624474892356488, \"variable\": \"row\", \"value\": \"3\", \"label\": 3}, {\"component 0\": 0.7835387510478179, \"component 1\": -0.6333333333333329, \"component 2\": 0.13020069134918655, \"variable\": \"row\", \"value\": \"4\", \"label\": 4}, {\"component 0\": 0.7835387510478179, \"component 1\": -0.6333333333333329, \"component 2\": 0.13020069134918655, \"variable\": \"row\", \"value\": \"5\", \"label\": 5}, {\"component 0\": -0.3084343480730051, \"component 1\": -0.6333333333333324, \"component 2\": -0.002375337535907171, \"variable\": \"row\", \"value\": \"6\", \"label\": 6}, {\"component 0\": -0.30843434807300524, \"component 1\": -0.6333333333333334, \"component 2\": -0.0023753375359081112, \"variable\": \"row\", \"value\": \"7\", \"label\": 7}, {\"component 0\": -0.7738621060434202, \"component 1\": -0.6333333333333331, \"component 2\": -0.06599092998426041, \"variable\": \"row\", \"value\": \"8\", \"label\": 8}, {\"component 0\": 0.7835387510478189, \"component 1\": 0.6333333333333334, \"component 2\": 0.1302006913491765, \"variable\": \"row\", \"value\": \"9\", \"label\": 9}, {\"component 0\": 0.7835387510478189, \"component 1\": 0.6333333333333334, \"component 2\": 0.1302006913491765, \"variable\": \"row\", \"value\": \"10\", \"label\": 10}, {\"component 0\": -0.3084343480730041, \"component 1\": 0.6333333333333339, \"component 2\": -0.0023753375359172324, \"variable\": \"row\", \"value\": \"11\", \"label\": 11}, {\"component 0\": -0.3084343480730043, \"component 1\": 0.6333333333333329, \"component 2\": -0.0023753375359181726, \"variable\": \"row\", 
\"value\": \"12\", \"label\": 12}, {\"component 0\": -0.7738621060434193, \"component 1\": 0.6333333333333332, \"component 2\": -0.06599092998427047, \"variable\": \"row\", \"value\": \"13\", \"label\": 13}, {\"component 0\": 0.8616907024708039, \"component 1\": -4.703117939444958e-15, \"component 2\": -0.49823772787073284, \"variable\": \"row\", \"value\": \"14\", \"label\": 14}, {\"component 0\": 0.8616907024708039, \"component 1\": -4.703117939444958e-15, \"component 2\": -0.49823772787073284, \"variable\": \"row\", \"value\": \"15\", \"label\": 15}, {\"component 0\": -0.23028239665001912, \"component 1\": -4.347386439195733e-15, \"component 2\": -0.6308137567558266, \"variable\": \"row\", \"value\": \"16\", \"label\": 16}, {\"component 0\": -0.23028239665001932, \"component 1\": -5.2644764023166255e-15, \"component 2\": -0.6308137567558275, \"variable\": \"row\", \"value\": \"17\", \"label\": 17}, {\"component 0\": -0.6957101546204343, \"component 1\": -4.963493155687989e-15, \"component 2\": -0.6944293492041799, \"variable\": \"row\", \"value\": \"18\", \"label\": 18}], \"data-711147a303e3ceea7e2371bc22d80b7a\": [{\"component 0\": 0.11730760677191546, \"component 1\": 0.6892024376045062, \"component 2\": -0.6412704755837051, \"variable\": \"column\", \"value\": \"Color_PURPLE\", \"label\": \"Color_PURPLE\"}, {\"component 0\": -0.13034178530212806, \"component 1\": -0.7657804862272294, \"component 2\": 0.7125227506485615, \"variable\": \"column\", \"value\": \"Color_YELLOW\", \"label\": \"Color_YELLOW\"}, {\"component 0\": 0.1173076067719144, \"component 1\": -0.6892024376045159, \"component 2\": -0.641270475583695, \"variable\": \"column\", \"value\": \"Size_LARGE\", \"label\": \"Size_LARGE\"}, {\"component 0\": -0.13034178530212692, \"component 1\": 0.7657804862272393, \"component 2\": 0.7125227506485502, \"variable\": \"column\", \"value\": \"Size_SMALL\", \"label\": \"Size_SMALL\"}, {\"component 0\": -0.8538641988881541, \"component 1\": 
-6.209900073327629e-16, \"component 2\": -0.0793400134079546, \"variable\": \"column\", \"value\": \"Action_DIP\", \"label\": \"Action_DIP\"}, {\"component 0\": 0.6209921446459306, \"component 1\": 2.1065276378011284e-16, \"component 2\": 0.05770182793305838, \"variable\": \"column\", \"value\": \"Action_STRETCH\", \"label\": \"Action_STRETCH\"}, {\"component 0\": 0.6209921446459306, \"component 1\": -6.15834260569712e-16, \"component 2\": 0.05770182793305778, \"variable\": \"column\", \"value\": \"Age_ADULT\", \"label\": \"Age_ADULT\"}, {\"component 0\": -0.8538641988881541, \"component 1\": 4.810967786383547e-16, \"component 2\": -0.0793400134079539, \"variable\": \"column\", \"value\": \"Age_CHILD\", \"label\": \"Age_CHILD\"}, {\"component 0\": -0.7314664035372919, \"component 1\": 9.568847313504198e-17, \"component 2\": -0.054731083980793564, \"variable\": \"column\", \"value\": \"Inflated_F\", \"label\": \"Inflated_F\"}, {\"component 0\": 1.253942406063929, \"component 1\": -6.325533971750396e-16, \"component 2\": 0.09382471539564684, \"variable\": \"column\", \"value\": \"Inflated_T\", \"label\": \"Inflated_T\"}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -601,7 +635,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 19,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.264339Z",
@@ -616,54 +650,54 @@
       "text/html": [
        "<style type=\"text/css\">\n",
        "</style>\n",
-       "<table id=\"T_d37d9\">\n",
+       "<table id=\"T_db76b\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_d37d9_level0_col0\" class=\"col_heading level0 col0\" >0</th>\n",
-       "      <th id=\"T_d37d9_level0_col1\" class=\"col_heading level0 col1\" >1</th>\n",
-       "      <th id=\"T_d37d9_level0_col2\" class=\"col_heading level0 col2\" >2</th>\n",
+       "      <th id=\"T_db76b_level0_col0\" class=\"col_heading level0 col0\" >0</th>\n",
+       "      <th id=\"T_db76b_level0_col1\" class=\"col_heading level0 col1\" >1</th>\n",
+       "      <th id=\"T_db76b_level0_col2\" class=\"col_heading level0 col2\" >2</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_d37d9_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
-       "      <td id=\"T_d37d9_row0_col0\" class=\"data row0 col0\" >7%</td>\n",
-       "      <td id=\"T_d37d9_row0_col1\" class=\"data row0 col1\" >0%</td>\n",
-       "      <td id=\"T_d37d9_row0_col2\" class=\"data row0 col2\" >16%</td>\n",
+       "      <th id=\"T_db76b_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
+       "      <td id=\"T_db76b_row0_col0\" class=\"data row0 col0\" >7%</td>\n",
+       "      <td id=\"T_db76b_row0_col1\" class=\"data row0 col1\" >0%</td>\n",
+       "      <td id=\"T_db76b_row0_col2\" class=\"data row0 col2\" >16%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d37d9_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
-       "      <td id=\"T_d37d9_row1_col0\" class=\"data row1 col0\" >2%</td>\n",
-       "      <td id=\"T_d37d9_row1_col1\" class=\"data row1 col1\" >0%</td>\n",
-       "      <td id=\"T_d37d9_row1_col2\" class=\"data row1 col2\" >11%</td>\n",
+       "      <th id=\"T_db76b_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
+       "      <td id=\"T_db76b_row1_col0\" class=\"data row1 col0\" >2%</td>\n",
+       "      <td id=\"T_db76b_row1_col1\" class=\"data row1 col1\" >0%</td>\n",
+       "      <td id=\"T_db76b_row1_col2\" class=\"data row1 col2\" >11%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d37d9_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
-       "      <td id=\"T_d37d9_row2_col0\" class=\"data row2 col0\" >2%</td>\n",
-       "      <td id=\"T_d37d9_row2_col1\" class=\"data row2 col1\" >0%</td>\n",
-       "      <td id=\"T_d37d9_row2_col2\" class=\"data row2 col2\" >11%</td>\n",
+       "      <th id=\"T_db76b_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
+       "      <td id=\"T_db76b_row2_col0\" class=\"data row2 col0\" >2%</td>\n",
+       "      <td id=\"T_db76b_row2_col1\" class=\"data row2 col1\" >0%</td>\n",
+       "      <td id=\"T_db76b_row2_col2\" class=\"data row2 col2\" >11%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d37d9_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
-       "      <td id=\"T_d37d9_row3_col0\" class=\"data row3 col0\" >10%</td>\n",
-       "      <td id=\"T_d37d9_row3_col1\" class=\"data row3 col1\" >0%</td>\n",
-       "      <td id=\"T_d37d9_row3_col2\" class=\"data row3 col2\" >9%</td>\n",
+       "      <th id=\"T_db76b_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
+       "      <td id=\"T_db76b_row3_col0\" class=\"data row3 col0\" >10%</td>\n",
+       "      <td id=\"T_db76b_row3_col1\" class=\"data row3 col1\" >0%</td>\n",
+       "      <td id=\"T_db76b_row3_col2\" class=\"data row3 col2\" >9%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d37d9_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
-       "      <td id=\"T_d37d9_row4_col0\" class=\"data row4 col0\" >8%</td>\n",
-       "      <td id=\"T_d37d9_row4_col1\" class=\"data row4 col1\" >10%</td>\n",
-       "      <td id=\"T_d37d9_row4_col2\" class=\"data row4 col2\" >0%</td>\n",
+       "      <th id=\"T_db76b_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
+       "      <td id=\"T_db76b_row4_col0\" class=\"data row4 col0\" >8%</td>\n",
+       "      <td id=\"T_db76b_row4_col1\" class=\"data row4 col1\" >10%</td>\n",
+       "      <td id=\"T_db76b_row4_col2\" class=\"data row4 col2\" >0%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x126ecac80>"
+       "<pandas.io.formats.style.Styler at 0x7f879d1a1f90>"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -674,7 +708,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 20,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.464586Z",
@@ -689,54 +723,54 @@
       "text/html": [
        "<style type=\"text/css\">\n",
        "</style>\n",
-       "<table id=\"T_7ef8b\">\n",
+       "<table id=\"T_ccbc0\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_7ef8b_level0_col0\" class=\"col_heading level0 col0\" >0</th>\n",
-       "      <th id=\"T_7ef8b_level0_col1\" class=\"col_heading level0 col1\" >1</th>\n",
-       "      <th id=\"T_7ef8b_level0_col2\" class=\"col_heading level0 col2\" >2</th>\n",
+       "      <th id=\"T_ccbc0_level0_col0\" class=\"col_heading level0 col0\" >0</th>\n",
+       "      <th id=\"T_ccbc0_level0_col1\" class=\"col_heading level0 col1\" >1</th>\n",
+       "      <th id=\"T_ccbc0_level0_col2\" class=\"col_heading level0 col2\" >2</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_7ef8b_level0_row0\" class=\"row_heading level0 row0\" >Color_PURPLE</th>\n",
-       "      <td id=\"T_7ef8b_row0_col0\" class=\"data row0 col0\" >0%</td>\n",
-       "      <td id=\"T_7ef8b_row0_col1\" class=\"data row0 col1\" >24%</td>\n",
-       "      <td id=\"T_7ef8b_row0_col2\" class=\"data row0 col2\" >23%</td>\n",
+       "      <th id=\"T_ccbc0_level0_row0\" class=\"row_heading level0 row0\" >Color_PURPLE</th>\n",
+       "      <td id=\"T_ccbc0_row0_col0\" class=\"data row0 col0\" >0%</td>\n",
+       "      <td id=\"T_ccbc0_row0_col1\" class=\"data row0 col1\" >24%</td>\n",
+       "      <td id=\"T_ccbc0_row0_col2\" class=\"data row0 col2\" >23%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_7ef8b_level0_row1\" class=\"row_heading level0 row1\" >Color_YELLOW</th>\n",
-       "      <td id=\"T_7ef8b_row1_col0\" class=\"data row1 col0\" >0%</td>\n",
-       "      <td id=\"T_7ef8b_row1_col1\" class=\"data row1 col1\" >26%</td>\n",
-       "      <td id=\"T_7ef8b_row1_col2\" class=\"data row1 col2\" >26%</td>\n",
+       "      <th id=\"T_ccbc0_level0_row1\" class=\"row_heading level0 row1\" >Color_YELLOW</th>\n",
+       "      <td id=\"T_ccbc0_row1_col0\" class=\"data row1 col0\" >0%</td>\n",
+       "      <td id=\"T_ccbc0_row1_col1\" class=\"data row1 col1\" >26%</td>\n",
+       "      <td id=\"T_ccbc0_row1_col2\" class=\"data row1 col2\" >26%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_7ef8b_level0_row2\" class=\"row_heading level0 row2\" >Size_LARGE</th>\n",
-       "      <td id=\"T_7ef8b_row2_col0\" class=\"data row2 col0\" >0%</td>\n",
-       "      <td id=\"T_7ef8b_row2_col1\" class=\"data row2 col1\" >24%</td>\n",
-       "      <td id=\"T_7ef8b_row2_col2\" class=\"data row2 col2\" >23%</td>\n",
+       "      <th id=\"T_ccbc0_level0_row2\" class=\"row_heading level0 row2\" >Size_LARGE</th>\n",
+       "      <td id=\"T_ccbc0_row2_col0\" class=\"data row2 col0\" >0%</td>\n",
+       "      <td id=\"T_ccbc0_row2_col1\" class=\"data row2 col1\" >24%</td>\n",
+       "      <td id=\"T_ccbc0_row2_col2\" class=\"data row2 col2\" >23%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_7ef8b_level0_row3\" class=\"row_heading level0 row3\" >Size_SMALL</th>\n",
-       "      <td id=\"T_7ef8b_row3_col0\" class=\"data row3 col0\" >0%</td>\n",
-       "      <td id=\"T_7ef8b_row3_col1\" class=\"data row3 col1\" >26%</td>\n",
-       "      <td id=\"T_7ef8b_row3_col2\" class=\"data row3 col2\" >26%</td>\n",
+       "      <th id=\"T_ccbc0_level0_row3\" class=\"row_heading level0 row3\" >Size_SMALL</th>\n",
+       "      <td id=\"T_ccbc0_row3_col0\" class=\"data row3 col0\" >0%</td>\n",
+       "      <td id=\"T_ccbc0_row3_col1\" class=\"data row3 col1\" >26%</td>\n",
+       "      <td id=\"T_ccbc0_row3_col2\" class=\"data row3 col2\" >26%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_7ef8b_level0_row4\" class=\"row_heading level0 row4\" >Action_DIP</th>\n",
-       "      <td id=\"T_7ef8b_row4_col0\" class=\"data row4 col0\" >15%</td>\n",
-       "      <td id=\"T_7ef8b_row4_col1\" class=\"data row4 col1\" >0%</td>\n",
-       "      <td id=\"T_7ef8b_row4_col2\" class=\"data row4 col2\" >0%</td>\n",
+       "      <th id=\"T_ccbc0_level0_row4\" class=\"row_heading level0 row4\" >Action_DIP</th>\n",
+       "      <td id=\"T_ccbc0_row4_col0\" class=\"data row4 col0\" >15%</td>\n",
+       "      <td id=\"T_ccbc0_row4_col1\" class=\"data row4 col1\" >0%</td>\n",
+       "      <td id=\"T_ccbc0_row4_col2\" class=\"data row4 col2\" >0%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x1788237c0>"
+       "<pandas.io.formats.style.Styler at 0x7f879d1f0b20>"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -755,7 +789,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 21,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.478001Z",
@@ -795,25 +829,25 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>0.461478</td>\n",
-       "      <td>6.638620e-29</td>\n",
+       "      <td>2.673675e-29</td>\n",
        "      <td>0.533786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>0.152256</td>\n",
-       "      <td>7.385455e-29</td>\n",
+       "      <td>3.338988e-29</td>\n",
        "      <td>0.399316</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>0.152256</td>\n",
-       "      <td>3.978637e-29</td>\n",
+       "      <td>2.354904e-29</td>\n",
        "      <td>0.399316</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>0.653335</td>\n",
-       "      <td>4.251294e-29</td>\n",
+       "      <td>2.348969e-29</td>\n",
        "      <td>0.284712</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -828,14 +862,14 @@
       ],
       "text/plain": [
        "          0             1         2\n",
-       "0  0.461478  6.638620e-29  0.533786\n",
-       "1  0.152256  7.385455e-29  0.399316\n",
-       "2  0.152256  3.978637e-29  0.399316\n",
-       "3  0.653335  4.251294e-29  0.284712\n",
+       "0  0.461478  2.673675e-29  0.533786\n",
+       "1  0.152256  3.338988e-29  0.399316\n",
+       "2  0.152256  2.354904e-29  0.399316\n",
+       "3  0.653335  2.348969e-29  0.284712\n",
        "4  0.592606  3.871772e-01  0.016363"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -846,7 +880,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 22,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2023-10-11T22:33:03.494732Z",
@@ -910,7 +944,7 @@
        "    <tr>\n",
        "      <th>Action_DIP</th>\n",
        "      <td>0.530243</td>\n",
-       "      <td>2.774134e-30</td>\n",
+       "      <td>2.804572e-31</td>\n",
        "      <td>0.004578</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -923,10 +957,10 @@
        "Color_YELLOW  0.015290  5.277778e-01  0.456920\n",
        "Size_LARGE    0.015290  5.277778e-01  0.456920\n",
        "Size_SMALL    0.015290  5.277778e-01  0.456920\n",
-       "Action_DIP    0.530243  2.774134e-30  0.004578"
+       "Action_DIP    0.530243  2.804572e-31  0.004578"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -934,6 +968,13 @@
    "source": [
     "mca.column_cosine_similarities(dataset).head()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -952,7 +993,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.10.14"
   },
   "vscode": {
    "interpreter": {

From f87c84326829268319616b5c68a77062d8b84453 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Sun, 8 Sep 2024 13:04:38 +0530
Subject: [PATCH 06/10] fixed handling of unknown columns during one-hot
 encoding in MCA

---
 prince/mca.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/prince/mca.py b/prince/mca.py
index 84466662..7abe9b76 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -31,7 +31,7 @@ def __init__(
         engine="sklearn",
         one_hot = True,
         get_dummies = False,#if True, use pd.get_dummies to one-hot encode the data
-        one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False), #OneHotEncoder object to use
+        one_hot_encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=bool), #OneHotEncoder object to use
         is_one_hot_fitted = False
     ):
         super().__init__(
@@ -55,14 +55,28 @@ def _prepare(self, X):
                 return X
             else:
                 if self.is_one_hot_fitted == False:
+                    #if the one_hot_encoder is not fitted, to fit and also set the is_one_hot_fitted variable to True
                     X_enc = self.one_hot_encoder.fit_transform(X)
                     X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
                     self.is_one_hot_fitted = True
                     return X_enc
                 else:
-                    X_enc = self.one_hot_encoder.transform(X)
-                    X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
-                    return X_enc
+                    #checking if the columns fed to the onehot encoder and the columns fitted to the onehot encoder are the same
+                    oh_cols = set(self.one_hot_encoder.feature_names_in_.tolist())
+                    X_cols = set(X.columns.tolist())
+                    
+                    if oh_cols == X_cols:
+                        #if the fitted cols are the same as the inferencing columns, then can transform
+                        X_enc = self.one_hot_encoder.transform(X)
+                        X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
+                        return X_enc
+                    else:
+                        #if the fitted cols are different to the inferencing columns, then should fit the onehot encoder again, to handle unit tests
+                        print(X_cols)
+                        print(oh_cols)
+                        X_enc = self.one_hot_encoder.fit_transform(X)
+                        X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
+                        return X_enc
         return X
 
     @utils.check_is_dataframe_input

From e63d74b162a42b3f2c44ca2903766fa992426ab7 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Sun, 8 Sep 2024 13:19:02 +0530
Subject: [PATCH 07/10] fixing merge issue in mca notebook in docs

---
 docs/content/mca.ipynb | 83 +++++++++++-------------------------------
 1 file changed, 21 insertions(+), 62 deletions(-)

diff --git a/docs/content/mca.ipynb b/docs/content/mca.ipynb
index 9bb2de0a..4bb49112 100644
--- a/docs/content/mca.ipynb
+++ b/docs/content/mca.ipynb
@@ -159,7 +159,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 2,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:04.556009Z",
@@ -168,15 +168,7 @@
      "shell.execute_reply": "2024-09-07T18:18:05.003821Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(19, 5)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import prince\n",
     "\n",
@@ -186,9 +178,7 @@
     "    copy=True,\n",
     "    check_input=True,\n",
     "    engine='sklearn',\n",
-    "    random_state=42,\n",
-    "    one_hot=True,\n",
-    "    get_dummies=True\n",
+    "    random_state=42\n",
     ")\n",
     "mca = mca.fit(dataset)"
    ]
@@ -202,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 3,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.011574Z",
@@ -211,15 +201,7 @@
      "shell.execute_reply": "2024-09-07T18:18:05.036730Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(19, 10)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "one_hot = pd.get_dummies(dataset)\n",
     "\n",
@@ -237,7 +219,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 4,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.042434Z",
@@ -310,7 +292,7 @@
        "2              0.186        18.56%                     79.84%"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -329,7 +311,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 5,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.067678Z",
@@ -409,7 +391,7 @@
        "4  0.783539 -6.333333e-01  0.130201"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -420,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 6,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.095132Z",
@@ -500,7 +482,7 @@
        "Action_DIP   -0.853864 -2.712409e-15 -0.079340"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -519,7 +501,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 7,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.127742Z",
@@ -529,22 +511,6 @@
     }
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
-      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
-      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
-      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
-      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
-      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
-      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
-      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
-      "/opt/anaconda3/envs/nibm_ml/lib/python3.10/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
-      "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -608,7 +574,7 @@
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -635,7 +601,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 8,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.215554Z",
@@ -697,7 +663,7 @@
        "<pandas.io.formats.style.Styler at 0x14745b7d0>"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -708,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 9,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.251107Z",
@@ -770,7 +736,7 @@
        "<pandas.io.formats.style.Styler at 0x147429950>"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -789,7 +755,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 10,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.274947Z",
@@ -869,7 +835,7 @@
        "4  0.592606  3.871772e-01  0.016363"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -880,7 +846,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 11,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2024-09-07T18:18:05.304890Z",
@@ -960,7 +926,7 @@
        "Action_DIP    0.530243  5.350665e-30  0.004578"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -968,13 +934,6 @@
    "source": [
     "mca.column_cosine_similarities(dataset).head()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

From 3bcff9c2b5897492a39f53d38572dda8ef5959ef Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Sun, 8 Sep 2024 13:25:52 +0530
Subject: [PATCH 08/10] removed code lines kept for debugging

---
 prince/mca.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/prince/mca.py b/prince/mca.py
index 7abe9b76..f506328a 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -72,8 +72,6 @@ def _prepare(self, X):
                         return X_enc
                     else:
                         #if the fitted cols are different to the inferencing columns, then should fit the onehot encoder again, to handle unit tests
-                        print(X_cols)
-                        print(oh_cols)
                         X_enc = self.one_hot_encoder.fit_transform(X)
                         X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))
                         return X_enc

From bfc91790bb84dc2aa54d487014831d33bf334ec0 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Sun, 8 Sep 2024 20:35:17 +0530
Subject: [PATCH 09/10] removed debug print in fit that caused 2 errors

---
 prince/mca.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/prince/mca.py b/prince/mca.py
index f506328a..8d7b8492 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -94,7 +94,6 @@ def fit(self, X, y=None):
         self.K_ = X.shape[1]
 
         # One-hot encode the data
-        print(X.shape)
         one_hot = self._prepare(X)
 
         # We need the number of columns to apply the Greenacre correction

From ef68f6b77d5eb791d0113bd956d3625093899317 Mon Sep 17 00:00:00 2001
From: Vaseekaran Varatharajah <vvasee1996@gmail.com>
Date: Sun, 8 Sep 2024 21:28:49 +0530
Subject: [PATCH 10/10] use `is False` instead of `== False` in _prepare

---
 prince/mca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prince/mca.py b/prince/mca.py
index 8d7b8492..3eef94a0 100644
--- a/prince/mca.py
+++ b/prince/mca.py
@@ -54,7 +54,7 @@ def _prepare(self, X):
                 X = pd.get_dummies(X, columns=X.columns)
                 return X
             else:
-                if self.is_one_hot_fitted == False:
+                if self.is_one_hot_fitted is False:
                     #if the one_hot_encoder is not fitted, to fit and also set the is_one_hot_fitted variable to True
                     X_enc = self.one_hot_encoder.fit_transform(X)
                     X_enc = pd.DataFrame(X_enc, columns=self.one_hot_encoder.get_feature_names_out(X.columns))