handle missings (#13)

* handle missings
Polkas · Dec 23, 2023 · 4de1583 · 4de1583
1 parent 62d8228
commit 4de1583
Show file tree

Hide file tree

Showing 7 changed files with 31 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## v0.1.4.9008
+## v0.1.4.9009
 
 - New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent.
 - Improved the lack of support for NaN and None in the `get_mappings`.

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 description = "Unifying an inconsistently coded categorical variable in a panel/longtitudal dataset."
 readme = "README.md"
-version = "0.1.4.9008"
+version = "0.1.4.9009"
 requires-python = ">=3.8"
 keywords = ["panel", "categorical", "longtitudal", "inconsistent", "cat2cat"]
 license = {text = "Apache License 2.0 | file LICENSE"}

diff --git a/src/cat2cat/cat2cat.py b/src/cat2cat/cat2cat.py
@@ -38,6 +38,10 @@ def cat2cat(
         `mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
         When ml part is applied then `ml.cat_var` has to have the same type too.
         Changes have to be made at the same time for the mapping table and datasets.
+        
+        3. Missing values in the mapping table or the categorical variable can cause problems.
+        It is recommended to use string or float types for both the mapping table and the categorical variable.
+        Alternatively, represent missing values as a specific number (e.g. 9999) or string (e.g. "Missing").
 
     >>> from cat2cat import cat2cat
     >>> from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml

diff --git a/src/cat2cat/cat2cat_ml.py b/src/cat2cat/cat2cat_ml.py
@@ -237,14 +237,14 @@ def cat2cat_ml_run(
 
     return cat2cat_ml_run_results(res, mappings, ml, kwargs)
 
-
 def _cat2cat_ml(
     ml: cat2cat_ml, mapp: Dict[Any, Any], target_df: DataFrame, cat_var_target: str
 ) -> None:
     """cat2cat ml optional part"""
     for target_cat in list(mapp.keys()):
         base_cats = mapp[target_cat]
         ml_cat_var = ml.data[ml.cat_var]
+
         if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1):
             continue
 

diff --git a/src/cat2cat/mappings.py b/src/cat2cat/mappings.py
@@ -25,6 +25,8 @@ def get_mappings(x: Table) -> Dict[str, Dict[Any, List[Any]]]:
         Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
     Note:
         There was made an effort to handle missings properly but please try to avoid of using NaN or None.
+        It is recommended to use string or float types.
+        Alternatively, represent missing values as a specific number (e.g. 9999) or string (e.g. "Missing").
 
     >>> from cat2cat.mappings import get_mappings
     >>> from numpy import array, nan
@@ -38,9 +40,9 @@ def get_mappings(x: Table) -> Dict[str, Dict[Any, List[Any]]]:
     >>> mappings["to_new"]
     {1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]}
     """
-    assert (len(x.shape) == 2) and (
+    assert hasattr(x, "shape") and ((len(x.shape) == 2) and (
         x.shape[1] == 2
-    ), "x should have 2 dimensions and the second one is equal to 2 (columns)"
+    )), "x should have 2 dimensions and the second one is equal to 2 (columns)"
 
     if isinstance(x, DataFrame):
         return get_mappings_df(x)

diff --git a/tests/test_cat2cat.py b/tests/test_cat2cat.py
@@ -34,7 +34,7 @@ def int_round(x: float) -> int:
 )
 
 trans_int = trans.copy()
-trans_int = trans_int.astype({"old": "Int64", "new": "Int64"})
+trans_int = trans_int.astype({"old": "float", "new": "float"})
 
 nr_rows_old = {"backward": 227662, "forward": 17223}
 nr_rows_new = {"backward": 17323, "forward": 18680}
@@ -59,7 +59,7 @@ def int_round(x: float) -> int:
     },
 }
 which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")}
-code_var_name = {"backward": "code", "forward": "code4"}
+ml_test_data = {"backward": {"var": "code", "data": occup.loc[occup.year >= 2010, :].copy()}, "forward": {"var": "code4", "data": occup.loc[occup.year <= 2008, :].copy()}}
 
 
 @pytest.mark.parametrize("direction", ["backward", "forward"])
@@ -148,7 +148,7 @@ def test_cat2cat_custom_freqs(direction, cat_type):
     assert data_dict[cat_type]["new"].equals(n)
 
 @pytest.mark.parametrize("cat_type", ["str", "int"])
-@pytest.mark.parametrize("direction", ["backward"])
+@pytest.mark.parametrize("direction", ["backward", "forward"])
 def test_cat2cat_ml(direction, cat_type):
     o = data_dict[cat_type]["old"].copy()
     n = data_dict[cat_type]["new"].copy()
@@ -159,8 +159,8 @@ def test_cat2cat_ml(direction, cat_type):
         data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
     )
     ml = cat2cat_ml(
-        occup.loc[occup.year >= 2010, :].copy(),
-        code_var_name[direction],
+        ml_test_data[direction]["data"],
+        ml_test_data[direction]["var"],
         ["salary", "age", "edu", "sex"],
         [DecisionTreeClassifier(), LinearDiscriminantAnalysis()],
     )

diff --git a/tests/test_mappings.py b/tests/test_mappings.py
@@ -123,6 +123,21 @@ def test_get_mappings_DataFrame():
     assert str(actual) == str(expected)
 
 
+@pytest.mark.parametrize("x", [1, "", [], {}])
+def test_get_mappings_wrong(x):
+    with pytest.raises(AssertionError):
+        get_mappings(x)
+
+
+class class_with_shape:
+    shape = [1, 2]
+
+
+def test_get_mappings_shape():
+    with pytest.raises(TypeError):
+        get_mappings(class_with_shape())
+
+
 # test with NaNs
 def test_get_mappings_nan_str():
     trans2 = trans.copy()
@@ -239,21 +254,6 @@ def test_get_mappings_nan_float():
     assert str(actual) == str(expected)
 
 
-@pytest.mark.parametrize("x", [1, "", [], {}])
-def test_get_mappings_wrong(x):
-    with pytest.raises(AttributeError):
-        get_mappings(x)
-
-
-class class_with_shape:
-    shape = [1, 2]
-
-
-def test_get_mappings_shape():
-    with pytest.raises(TypeError):
-        get_mappings(class_with_shape())
-
-
 def test_get_mappings_different_types():
     trans2 = trans.copy()
     trans2["old"] = trans2["old"].astype(float)