From 604012c821444c41dc584a33c86594f93855a36f Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Mon, 23 Oct 2023 22:11:28 +0200 Subject: [PATCH] missings (#8) --- CHANGELOG.md | 2 +- README.md | 4 +- pyproject.toml | 2 +- src/cat2cat/cat2cat.py | 8 +--- src/cat2cat/mappings.py | 18 ++++----- tests/test_cat2cat.py | 7 ++-- tests/test_mappings.py | 86 ++++++++++++++++++++--------------------- 7 files changed, 62 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d163bb3..afd918a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## v0.1.4.9002 +## v0.1.4.9004 - Improved the lack of support for NaN and None in the `get_mappings`. - Fixed a bug that `cat2cat_ml.features` can be only a `list` not a `Sequence`. diff --git a/README.md b/README.md index e02f501..3da9536 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ There is offered the cat2cat procedure to map a categorical variable according to a mapping (transition) table between two different time points. The mapping (transition) table should to have a candidate for each category from the targeted for an update period. The main rule is to replicate the observation if it could be assigned to a few categories, then using simple frequencies or statistical methods to approximate probabilities of being assigned to each of them. -This algorithm was invented and implemented in the paper by (Nasinski, Majchrowska and Broniatowska (2020) doi:10.24425/cejeme.2020.134747). 
+**This algorithm was invented and implemented in the paper by [(Nasinski, Majchrowska and Broniatowska (2020))](https://doi.org/10.24425/cejeme.2020.134747).** + +**For more details please read the paper by [(Nasinski, Gajowniczek (2023))](https://doi.org/10.1016/j.softx.2023.101525).** ## Installation diff --git a/pyproject.toml b/pyproject.toml index 9d4789f..bb29d0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cat2cat" -version = "0.1.4.9003" +version = "0.1.4.9004" description = "Unifying an inconsistently coded categorical variable in a panel/longtitudal dataset." authors = ["Maciej Nasinski"] license = "MIT" diff --git a/src/cat2cat/cat2cat.py b/src/cat2cat/cat2cat.py index 67bb542..24cdaf0 100644 --- a/src/cat2cat/cat2cat.py +++ b/src/cat2cat/cat2cat.py @@ -33,12 +33,9 @@ def cat2cat( When ml model is broken then weights from simple frequencies are taken. `knn` method is recommended for smaller datasets. - 2. Please be sure that the categorical variable is of the same type in all cases. + 2. Please be sure that the categorical variable is of the same type in all places. `mappings.trans` arg columns and the `data.cat_var` column have to be of the same type. - When ml part applied `ml.cat_var` has to have the same type too. - - 3. Please covert all numpy.NaN to some numeric value like 999999. - None`s in a pandas column have to be converted to a "None" character. + When ml part is applied then `ml.cat_var` has to have the same type too. Changes have to be made at the same time for the mapping table and datasets. 
>>> from cat2cat import cat2cat @@ -156,7 +153,6 @@ def _cat2cat_ml( for target_cat in list(mapp.keys()): base_cats = mapp[target_cat] ml_cat_var = ml.data[ml.cat_var] - if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1): continue diff --git a/src/cat2cat/mappings.py b/src/cat2cat/mappings.py index 8659a50..01762a6 100644 --- a/src/cat2cat/mappings.py +++ b/src/cat2cat/mappings.py @@ -2,6 +2,7 @@ from numpy import ndarray, unique, repeat, array, round, unique, sort, isnan from collections.abc import Iterable +from collections import OrderedDict from typing import Union, Optional, Any, List, Dict, Sequence __all__ = ["get_mappings", "cat_apply_freq", "get_freqs"] @@ -22,20 +23,19 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]] Returns: Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`. Note: - Please covert all numpy.NaN to some numeric value like 999999. - None`s in a pandas column have to be converted to a "None" character. + An effort was made to handle missing values properly, but please try to avoid using NaN or None. >>> from cat2cat.mappings import get_mappings - >>> from numpy import array + >>> from numpy import array, nan >>> trans = array([ - ... [1111, 111101], [1111, 111102], [1123, 111405], - ... [1212, 112006], [1212, 112008], [1212, 112090], + ... [1111, 111101], [1111, 111102], [1123, 111405], [nan, 111405], + ... [1212, 112006], [1212, 112008], [1212, 112090], [1212, nan], ... 
]) >>> mappings = get_mappings(trans) >>> mappings["to_old"] - {112006: [1212], 112008: [1212], 111405: [1123], 112090: [1212], 111101: [1111], 111102: [1111]} + {111101.0: [1111.0], 111102.0: [1111.0], 111405.0: [1123.0, nan], 112006.0: [1212.0], 112008.0: [1212.0], 112090.0: [1212.0], nan: [1212.0]} >>> mappings["to_new"] - {1123: [111405], 1212: [112006, 112008, 112090], 1111: [111101, 111102]} + {1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]} """ assert (len(x.shape) == 2) and ( @@ -62,8 +62,8 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]] ff[which_ff_null | (ff == None)] = "None" ss[which_ss_null | (ss == None)] = "None" - from_old = unique(ff) - from_new = unique(ss) + from_old = list(OrderedDict.fromkeys(ff)) + from_new = list(OrderedDict.fromkeys(ss)) to_old = dict() for e in from_new: diff --git a/tests/test_cat2cat.py b/tests/test_cat2cat.py index ffb8522..57e5bb0 100644 --- a/tests/test_cat2cat.py +++ b/tests/test_cat2cat.py @@ -3,7 +3,7 @@ from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml from cat2cat.cat2cat_utils import dummy_c2c from pandas import concat, DataFrame -from numpy import round, setdiff1d +from numpy import round, setdiff1d, nan import pytest from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.tree import DecisionTreeClassifier @@ -30,12 +30,11 @@ def int_round(x: float) -> int: trans = load_trans() # impute missing values trans = concat( - [trans, DataFrame({"old": "99999", "new": setdiff1d(o_new.code, trans.new)})] + [trans, DataFrame({"old": nan, "new": setdiff1d(o_new.code, trans.new)})] ) trans_int = trans.copy() -trans_int.loc[trans_int["old"].isnull(), "old"] = "99999" -trans_int = trans_int.astype({"old": int, "new": int}) +trans_int = trans_int.astype({"old": "Int64", "new": "Int64"}) nr_rows_old = {"backward": 227662, "forward": 17223} nr_rows_new = {"backward": 17323, 
"forward": 18680} diff --git a/tests/test_mappings.py b/tests/test_mappings.py index b3335e9..c9e59ec 100644 --- a/tests/test_mappings.py +++ b/tests/test_mappings.py @@ -37,28 +37,28 @@ def test_get_freqs_range(): actual = get_freqs(list(range(10))) expected = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1} - assert actual == expected + assert str(actual) == str(expected) def test_get_freqs_random_array(): seed(1234) actual = get_freqs(choice(5, 100, replace=True)) expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23} - assert actual == expected + assert str(actual) == str(expected) def test_get_freqs_random_list(): seed(1234) actual = get_freqs(list(choice(5, 100, replace=True))) expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23} - assert actual == expected + assert str(actual) == str(expected) def test_get_freqs_multiplier(): seed(1234) actual = get_freqs(choice(5, 100, replace=True), choice(5, 100, replace=True)) expected = {0: 25, 1: 60, 2: 40, 3: 27, 4: 43} - assert actual == expected + assert str(actual) == str(expected) def test_get_freqs_multiplier_len(): @@ -74,39 +74,39 @@ def test_get_mappings_array(): actual = get_mappings(array(trans_small)) expected = { "to_old": { - 112001: [1212], - 112002: [1212], - 112006: [1212], - 112007: [1211], - 112008: [1212], - 112013: [1212], - 112016: [1211], - 112017: [1211], - 112019: [1211], - 111401: [1122], + 111101: [1111], + 111102: [1111], + 111103: [1111], + 111201: [1112], + 111202: [1112], + 111301: [1112], 111402: [1121], + 111401: [1122], 111403: [1122], 111404: [1122], 111405: [1123], - 111301: [1112], + 112007: [1211], + 112016: [1211], + 112017: [1211], + 112019: [1211], + 112002: [1212], + 112013: [1212], + 112001: [1212], + 112006: [1212], + 112008: [1212], 112090: [1212], - 111201: [1112], - 111202: [1112], - 111101: [1111], - 111102: [1111], - 111103: [1111], }, "to_new": { + 1111: [111101, 111102, 111103], + 1112: [111201, 111202, 111301], 1121: [111402], 1122: [111401, 111403, 111404], 1123: 
[111405], - 1111: [111101, 111102, 111103], - 1112: [111201, 111202, 111301], 1211: [112007, 112016, 112017, 112019], 1212: [112001, 112002, 112006, 112008, 112013, 112090], }, } - assert actual == expected + assert str(actual) == str(expected) def test_get_mappings_DataFrame(): @@ -120,7 +120,7 @@ def test_get_mappings_DataFrame(): "331508", "333902", ] - assert actual == expected + assert str(actual) == str(expected) # test with NaNs @@ -139,29 +139,29 @@ def test_get_mappings_nan_str(): "111201": ["1112"], "111202": ["1112"], "111301": ["1112"], - "111401": ["1122"], "111402": ["1121"], + "111401": ["1122"], "111403": ["1122"], "111404": ["1122"], "111405": ["1123"], - "112001": ["1212"], + "112007": ["1211"], + "112016": ["1211"], + "112017": ["1211"], + "112019": ["1211"], "112002": ["1212"], - "112003": ["1212"], - "112004": ["1212"], - "112005": ["1212"], + "112013": ["1212"], + "112001": ["1212"], "112006": ["1212"], - "112007": ["1211"], "112008": ["1212"], "112009": ["1212"], "112010": ["1212"], - "112011": ["1212"], - "112012": ["1212"], - "112013": ["1212"], "112014": ["1212"], + "112005": ["1212"], + "112012": ["1212"], "112015": ["1212"], - "112016": ["1211"], - "112017": ["1211"], - "112019": ["1211"], + "112003": ["1212"], + "112004": ["1212"], + "112011": ["1212"], "112090": ["1212"], "None": ["1111"], }, @@ -208,20 +208,20 @@ def test_get_mappings_nan_float(): 111201.0: [1112.0], 111202.0: [1112.0], 111301.0: [1112.0], - 111401.0: [1122.0], 111402.0: [1121.0], + 111401.0: [1122.0], 111403.0: [1122.0], 111404.0: [1122.0], 111405.0: [1123.0], - 112001.0: [1212.0], - 112002.0: [1212.0], - 112006.0: [1212.0], 112007.0: [1211.0], - 112008.0: [1212.0], - 112013.0: [1212.0], 112016.0: [1211.0], 112017.0: [1211.0], 112019.0: [1211.0], + 112002.0: [1212.0], + 112013.0: [1212.0], + 112001.0: [1212.0], + 112006.0: [1212.0], + 112008.0: [1212.0], 112090.0: [1212.0], NaN: [1111.0], }, @@ -270,4 +270,4 @@ def test_cat_apply_freq(): 
get_freqs(occup.code[occup.year == 2010].map(str).to_list()), )["3417"] expected = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - assert actual == expected + assert str(actual) == str(expected)