missings (#8)

Polkas · Oct 23, 2023 · 604012c · 604012c
1 parent 9d5d12d
commit 604012c
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 65 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## v0.1.4.9002
+## v0.1.4.9004
 
 - Added support for NaN and None in `get_mappings`.
 - Fixed a bug where `cat2cat_ml.features` could only be a `list`, not a `Sequence`.

diff --git a/README.md b/README.md
@@ -24,7 +24,9 @@
 
 The cat2cat procedure maps a categorical variable according to a mapping (transition) table between two different time points. The mapping (transition) table should have a candidate for each category from the period targeted for an update. The main rule is to replicate the observation if it could be assigned to several categories, and then to use simple frequencies or statistical methods to approximate the probabilities of being assigned to each of them.
 
-This algorithm was invented and implemented in the paper by (Nasinski, Majchrowska and Broniatowska (2020) doi:10.24425/cejeme.2020.134747).
+**This algorithm was invented and implemented in the paper by [(Nasinski, Majchrowska and Broniatowska (2020))](https://doi.org/10.24425/cejeme.2020.134747).**
+
+**For more details please read the paper by [(Nasinski, Gajowniczek (2023))](https://doi.org/10.1016/j.softx.2023.101525).**
 
 ## Installation
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cat2cat"
-version = "0.1.4.9003"
+version = "0.1.4.9004"
 description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
 authors = ["Maciej Nasinski"]
 license = "MIT"

diff --git a/src/cat2cat/cat2cat.py b/src/cat2cat/cat2cat.py
@@ -33,12 +33,9 @@ def cat2cat(
         When ml model is broken then weights from simple frequencies are taken.
         `knn` method is recommended for smaller datasets.
 
-        2. Please be sure that the categorical variable is of the same type in all cases.
+        2. Please be sure that the categorical variable is of the same type in all places.
         `mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
-        When ml part applied `ml.cat_var` has to have the same type too.
-
-        3. Please covert all numpy.NaN to some numeric value like 999999.
-        None`s in a pandas column have to be converted to a "None" character.
+        When ml part is applied then `ml.cat_var` has to have the same type too.
         Changes have to be made at the same time for the mapping table and datasets.
 
     >>> from cat2cat import cat2cat
@@ -156,7 +153,6 @@ def _cat2cat_ml(
     for target_cat in list(mapp.keys()):
         base_cats = mapp[target_cat]
         ml_cat_var = ml.data[ml.cat_var]
-
         if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1):
             continue
 

diff --git a/src/cat2cat/mappings.py b/src/cat2cat/mappings.py
@@ -2,6 +2,7 @@
 from numpy import ndarray, unique, repeat, array, round, unique, sort, isnan
 
 from collections.abc import Iterable
+from collections import OrderedDict
 from typing import Union, Optional, Any, List, Dict, Sequence
 
 __all__ = ["get_mappings", "cat_apply_freq", "get_freqs"]
@@ -22,20 +23,19 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
     Returns:
         Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
     Note:
-        Please covert all numpy.NaN to some numeric value like 999999.
-        None`s in a pandas column have to be converted to a "None" character.
+        An effort was made to handle missing values properly, but please try to avoid using NaN or None.
 
     >>> from cat2cat.mappings import get_mappings
-    >>> from numpy import array
+    >>> from numpy import array, nan
     >>> trans = array([
-    ...   [1111, 111101], [1111, 111102], [1123, 111405],
-    ...   [1212, 112006], [1212, 112008], [1212, 112090],
+    ...   [1111, 111101], [1111, 111102], [1123, 111405], [nan, 111405],
+    ...   [1212, 112006], [1212, 112008], [1212, 112090], [1212, nan],
     ... ])
     >>> mappings = get_mappings(trans)
     >>> mappings["to_old"]
-    {112006: [1212], 112008: [1212], 111405: [1123], 112090: [1212], 111101: [1111], 111102: [1111]}
+    {111101.0: [1111.0], 111102.0: [1111.0], 111405.0: [1123.0, nan], 112006.0: [1212.0], 112008.0: [1212.0], 112090.0: [1212.0], nan: [1212.0]}
     >>> mappings["to_new"]
-    {1123: [111405], 1212: [112006, 112008, 112090], 1111: [111101, 111102]}
+    {1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]}
     """
 
     assert (len(x.shape) == 2) and (
@@ -62,8 +62,8 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
         ff[which_ff_null | (ff == None)] = "None"
         ss[which_ss_null | (ss == None)] = "None"
 
-    from_old = unique(ff)
-    from_new = unique(ss)
+    from_old = list(OrderedDict.fromkeys(ff))
+    from_new = list(OrderedDict.fromkeys(ss))
 
     to_old = dict()
     for e in from_new:

diff --git a/tests/test_cat2cat.py b/tests/test_cat2cat.py
@@ -3,7 +3,7 @@
 from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
 from cat2cat.cat2cat_utils import dummy_c2c
 from pandas import concat, DataFrame
-from numpy import round, setdiff1d
+from numpy import round, setdiff1d, nan
 import pytest
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.tree import DecisionTreeClassifier
@@ -30,12 +30,11 @@ def int_round(x: float) -> int:
 trans = load_trans()
 # impute missing values
 trans = concat(
-    [trans, DataFrame({"old": "99999", "new": setdiff1d(o_new.code, trans.new)})]
+    [trans, DataFrame({"old": nan, "new": setdiff1d(o_new.code, trans.new)})]
 )
 
 trans_int = trans.copy()
-trans_int.loc[trans_int["old"].isnull(), "old"] = "99999"
-trans_int = trans_int.astype({"old": int, "new": int})
+trans_int = trans_int.astype({"old": "Int64", "new": "Int64"})
 
 nr_rows_old = {"backward": 227662, "forward": 17223}
 nr_rows_new = {"backward": 17323, "forward": 18680}

diff --git a/tests/test_mappings.py b/tests/test_mappings.py
@@ -37,28 +37,28 @@
 def test_get_freqs_range():
     actual = get_freqs(list(range(10)))
     expected = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 def test_get_freqs_random_array():
     seed(1234)
     actual = get_freqs(choice(5, 100, replace=True))
     expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23}
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 def test_get_freqs_random_list():
     seed(1234)
     actual = get_freqs(list(choice(5, 100, replace=True)))
     expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23}
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 def test_get_freqs_multiplier():
     seed(1234)
     actual = get_freqs(choice(5, 100, replace=True), choice(5, 100, replace=True))
     expected = {0: 25, 1: 60, 2: 40, 3: 27, 4: 43}
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 def test_get_freqs_multiplier_len():
@@ -74,39 +74,39 @@ def test_get_mappings_array():
     actual = get_mappings(array(trans_small))
     expected = {
         "to_old": {
-            112001: [1212],
-            112002: [1212],
-            112006: [1212],
-            112007: [1211],
-            112008: [1212],
-            112013: [1212],
-            112016: [1211],
-            112017: [1211],
-            112019: [1211],
-            111401: [1122],
+            111101: [1111],
+            111102: [1111],
+            111103: [1111],
+            111201: [1112],
+            111202: [1112],
+            111301: [1112],
             111402: [1121],
+            111401: [1122],
             111403: [1122],
             111404: [1122],
             111405: [1123],
-            111301: [1112],
+            112007: [1211],
+            112016: [1211],
+            112017: [1211],
+            112019: [1211],
+            112002: [1212],
+            112013: [1212],
+            112001: [1212],
+            112006: [1212],
+            112008: [1212],
             112090: [1212],
-            111201: [1112],
-            111202: [1112],
-            111101: [1111],
-            111102: [1111],
-            111103: [1111],
         },
         "to_new": {
+            1111: [111101, 111102, 111103],
+            1112: [111201, 111202, 111301],
             1121: [111402],
             1122: [111401, 111403, 111404],
             1123: [111405],
-            1111: [111101, 111102, 111103],
-            1112: [111201, 111202, 111301],
             1211: [112007, 112016, 112017, 112019],
             1212: [112001, 112002, 112006, 112008, 112013, 112090],
         },
     }
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 def test_get_mappings_DataFrame():
@@ -120,7 +120,7 @@ def test_get_mappings_DataFrame():
         "331508",
         "333902",
     ]
-    assert actual == expected
+    assert str(actual) == str(expected)
 
 
 # test with NaNs
@@ -139,29 +139,29 @@ def test_get_mappings_nan_str():
             "111201": ["1112"],
             "111202": ["1112"],
             "111301": ["1112"],
-            "111401": ["1122"],
             "111402": ["1121"],
+            "111401": ["1122"],
             "111403": ["1122"],
             "111404": ["1122"],
             "111405": ["1123"],
-            "112001": ["1212"],
+            "112007": ["1211"],
+            "112016": ["1211"],
+            "112017": ["1211"],
+            "112019": ["1211"],
             "112002": ["1212"],
-            "112003": ["1212"],
-            "112004": ["1212"],
-            "112005": ["1212"],
+            "112013": ["1212"],
+            "112001": ["1212"],
             "112006": ["1212"],
-            "112007": ["1211"],
             "112008": ["1212"],
             "112009": ["1212"],
             "112010": ["1212"],
-            "112011": ["1212"],
-            "112012": ["1212"],
-            "112013": ["1212"],
             "112014": ["1212"],
+            "112005": ["1212"],
+            "112012": ["1212"],
             "112015": ["1212"],
-            "112016": ["1211"],
-            "112017": ["1211"],
-            "112019": ["1211"],
+            "112003": ["1212"],
+            "112004": ["1212"],
+            "112011": ["1212"],
             "112090": ["1212"],
             "None": ["1111"],
         },
@@ -208,20 +208,20 @@ def test_get_mappings_nan_float():
             111201.0: [1112.0],
             111202.0: [1112.0],
             111301.0: [1112.0],
-            111401.0: [1122.0],
             111402.0: [1121.0],
+            111401.0: [1122.0],
             111403.0: [1122.0],
             111404.0: [1122.0],
             111405.0: [1123.0],
-            112001.0: [1212.0],
-            112002.0: [1212.0],
-            112006.0: [1212.0],
             112007.0: [1211.0],
-            112008.0: [1212.0],
-            112013.0: [1212.0],
             112016.0: [1211.0],
             112017.0: [1211.0],
             112019.0: [1211.0],
+            112002.0: [1212.0],
+            112013.0: [1212.0],
+            112001.0: [1212.0],
+            112006.0: [1212.0],
+            112008.0: [1212.0],
             112090.0: [1212.0],
             NaN: [1111.0],
         },
@@ -270,4 +270,4 @@ def test_cat_apply_freq():
         get_freqs(occup.code[occup.year == 2010].map(str).to_list()),
     )["3417"]
     expected = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-    assert actual == expected
+    assert str(actual) == str(expected)