Skip to content

Commit

Permalink
missings (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
Polkas authored Oct 23, 2023
1 parent 9d5d12d commit 604012c
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 65 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

## v0.1.4.9002
## v0.1.4.9004

- Improved the handling of NaN and None in `get_mappings`.
- Fixed a bug where `cat2cat_ml.features` could only be a `list`, not a `Sequence`.
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@

The cat2cat procedure maps a categorical variable according to a mapping (transition) table between two different time points. The mapping (transition) table should have a candidate for each category from the period targeted for an update. The main rule is to replicate an observation if it could be assigned to several categories, and then to use simple frequencies or statistical methods to approximate the probability of it being assigned to each of them.

This algorithm was invented and implemented in the paper by (Nasinski, Majchrowska and Broniatowska (2020) doi:10.24425/cejeme.2020.134747).
**This algorithm was invented and implemented in the paper by [(Nasinski, Majchrowska and Broniatowska (2020))](https://doi.org/10.24425/cejeme.2020.134747).**

**For more details please read the paper by [(Nasinski, Gajowniczek (2023))](https://doi.org/10.1016/j.softx.2023.101525).**

## Installation

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat2cat"
version = "0.1.4.9003"
version = "0.1.4.9004"
description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
authors = ["Maciej Nasinski"]
license = "MIT"
Expand Down
8 changes: 2 additions & 6 deletions src/cat2cat/cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,9 @@ def cat2cat(
When ml model is broken then weights from simple frequencies are taken.
`knn` method is recommended for smaller datasets.
2. Please be sure that the categorical variable is of the same type in all cases.
2. Please be sure that the categorical variable is of the same type in all places.
`mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
When ml part applied `ml.cat_var` has to have the same type too.
3. Please convert all numpy.NaN to some numeric value like 999999.
`None`s in a pandas column have to be converted to the "None" string.
When ml part is applied then `ml.cat_var` has to have the same type too.
Changes have to be made at the same time for the mapping table and datasets.
>>> from cat2cat import cat2cat
Expand Down Expand Up @@ -156,7 +153,6 @@ def _cat2cat_ml(
for target_cat in list(mapp.keys()):
base_cats = mapp[target_cat]
ml_cat_var = ml.data[ml.cat_var]

if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1):
continue

Expand Down
18 changes: 9 additions & 9 deletions src/cat2cat/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from numpy import ndarray, unique, repeat, array, round, unique, sort, isnan

from collections.abc import Iterable
from collections import OrderedDict
from typing import Union, Optional, Any, List, Dict, Sequence

__all__ = ["get_mappings", "cat_apply_freq", "get_freqs"]
Expand All @@ -22,20 +23,19 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
Returns:
Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
Note:
Please convert all numpy.NaN to some numeric value like 999999.
`None`s in a pandas column have to be converted to the "None" string.
An effort was made to handle missing values properly, but please try to avoid using NaN or None.
>>> from cat2cat.mappings import get_mappings
>>> from numpy import array
>>> from numpy import array, nan
>>> trans = array([
... [1111, 111101], [1111, 111102], [1123, 111405],
... [1212, 112006], [1212, 112008], [1212, 112090],
... [1111, 111101], [1111, 111102], [1123, 111405], [nan, 111405],
... [1212, 112006], [1212, 112008], [1212, 112090], [1212, nan],
... ])
>>> mappings = get_mappings(trans)
>>> mappings["to_old"]
{112006: [1212], 112008: [1212], 111405: [1123], 112090: [1212], 111101: [1111], 111102: [1111]}
{111101.0: [1111.0], 111102.0: [1111.0], 111405.0: [1123.0, nan], 112006.0: [1212.0], 112008.0: [1212.0], 112090.0: [1212.0], nan: [1212.0]}
>>> mappings["to_new"]
{1123: [111405], 1212: [112006, 112008, 112090], 1111: [111101, 111102]}
{1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]}
"""

assert (len(x.shape) == 2) and (
Expand All @@ -62,8 +62,8 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
ff[which_ff_null | (ff == None)] = "None"
ss[which_ss_null | (ss == None)] = "None"

from_old = unique(ff)
from_new = unique(ss)
from_old = list(OrderedDict.fromkeys(ff))
from_new = list(OrderedDict.fromkeys(ss))

to_old = dict()
for e in from_new:
Expand Down
7 changes: 3 additions & 4 deletions tests/test_cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
from cat2cat.cat2cat_utils import dummy_c2c
from pandas import concat, DataFrame
from numpy import round, setdiff1d
from numpy import round, setdiff1d, nan
import pytest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
Expand All @@ -30,12 +30,11 @@ def int_round(x: float) -> int:
trans = load_trans()
# impute missing values
trans = concat(
[trans, DataFrame({"old": "99999", "new": setdiff1d(o_new.code, trans.new)})]
[trans, DataFrame({"old": nan, "new": setdiff1d(o_new.code, trans.new)})]
)

trans_int = trans.copy()
trans_int.loc[trans_int["old"].isnull(), "old"] = "99999"
trans_int = trans_int.astype({"old": int, "new": int})
trans_int = trans_int.astype({"old": "Int64", "new": "Int64"})

nr_rows_old = {"backward": 227662, "forward": 17223}
nr_rows_new = {"backward": 17323, "forward": 18680}
Expand Down
86 changes: 43 additions & 43 deletions tests/test_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,28 +37,28 @@
def test_get_freqs_range():
actual = get_freqs(list(range(10)))
expected = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
assert actual == expected
assert str(actual) == str(expected)


def test_get_freqs_random_array():
seed(1234)
actual = get_freqs(choice(5, 100, replace=True))
expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23}
assert actual == expected
assert str(actual) == str(expected)


def test_get_freqs_random_list():
seed(1234)
actual = get_freqs(list(choice(5, 100, replace=True)))
expected = {0: 14, 1: 25, 2: 21, 3: 17, 4: 23}
assert actual == expected
assert str(actual) == str(expected)


def test_get_freqs_multiplier():
seed(1234)
actual = get_freqs(choice(5, 100, replace=True), choice(5, 100, replace=True))
expected = {0: 25, 1: 60, 2: 40, 3: 27, 4: 43}
assert actual == expected
assert str(actual) == str(expected)


def test_get_freqs_multiplier_len():
Expand All @@ -74,39 +74,39 @@ def test_get_mappings_array():
actual = get_mappings(array(trans_small))
expected = {
"to_old": {
112001: [1212],
112002: [1212],
112006: [1212],
112007: [1211],
112008: [1212],
112013: [1212],
112016: [1211],
112017: [1211],
112019: [1211],
111401: [1122],
111101: [1111],
111102: [1111],
111103: [1111],
111201: [1112],
111202: [1112],
111301: [1112],
111402: [1121],
111401: [1122],
111403: [1122],
111404: [1122],
111405: [1123],
111301: [1112],
112007: [1211],
112016: [1211],
112017: [1211],
112019: [1211],
112002: [1212],
112013: [1212],
112001: [1212],
112006: [1212],
112008: [1212],
112090: [1212],
111201: [1112],
111202: [1112],
111101: [1111],
111102: [1111],
111103: [1111],
},
"to_new": {
1111: [111101, 111102, 111103],
1112: [111201, 111202, 111301],
1121: [111402],
1122: [111401, 111403, 111404],
1123: [111405],
1111: [111101, 111102, 111103],
1112: [111201, 111202, 111301],
1211: [112007, 112016, 112017, 112019],
1212: [112001, 112002, 112006, 112008, 112013, 112090],
},
}
assert actual == expected
assert str(actual) == str(expected)


def test_get_mappings_DataFrame():
Expand All @@ -120,7 +120,7 @@ def test_get_mappings_DataFrame():
"331508",
"333902",
]
assert actual == expected
assert str(actual) == str(expected)


# test with NaNs
Expand All @@ -139,29 +139,29 @@ def test_get_mappings_nan_str():
"111201": ["1112"],
"111202": ["1112"],
"111301": ["1112"],
"111401": ["1122"],
"111402": ["1121"],
"111401": ["1122"],
"111403": ["1122"],
"111404": ["1122"],
"111405": ["1123"],
"112001": ["1212"],
"112007": ["1211"],
"112016": ["1211"],
"112017": ["1211"],
"112019": ["1211"],
"112002": ["1212"],
"112003": ["1212"],
"112004": ["1212"],
"112005": ["1212"],
"112013": ["1212"],
"112001": ["1212"],
"112006": ["1212"],
"112007": ["1211"],
"112008": ["1212"],
"112009": ["1212"],
"112010": ["1212"],
"112011": ["1212"],
"112012": ["1212"],
"112013": ["1212"],
"112014": ["1212"],
"112005": ["1212"],
"112012": ["1212"],
"112015": ["1212"],
"112016": ["1211"],
"112017": ["1211"],
"112019": ["1211"],
"112003": ["1212"],
"112004": ["1212"],
"112011": ["1212"],
"112090": ["1212"],
"None": ["1111"],
},
Expand Down Expand Up @@ -208,20 +208,20 @@ def test_get_mappings_nan_float():
111201.0: [1112.0],
111202.0: [1112.0],
111301.0: [1112.0],
111401.0: [1122.0],
111402.0: [1121.0],
111401.0: [1122.0],
111403.0: [1122.0],
111404.0: [1122.0],
111405.0: [1123.0],
112001.0: [1212.0],
112002.0: [1212.0],
112006.0: [1212.0],
112007.0: [1211.0],
112008.0: [1212.0],
112013.0: [1212.0],
112016.0: [1211.0],
112017.0: [1211.0],
112019.0: [1211.0],
112002.0: [1212.0],
112013.0: [1212.0],
112001.0: [1212.0],
112006.0: [1212.0],
112008.0: [1212.0],
112090.0: [1212.0],
NaN: [1111.0],
},
Expand Down Expand Up @@ -270,4 +270,4 @@ def test_cat_apply_freq():
get_freqs(occup.code[occup.year == 2010].map(str).to_list()),
)["3417"]
expected = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
assert actual == expected
assert str(actual) == str(expected)

0 comments on commit 604012c

Please sign in to comment.