Skip to content

Commit

Permalink
handle missings (#13)
Browse files Browse the repository at this point in the history
* handle missings
  • Loading branch information
Polkas authored Dec 23, 2023
1 parent 62d8228 commit 4de1583
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

## v0.1.4.9008
## v0.1.4.9009

- New `cat2cat_ml_run` function to check the performance of the ml models before `cat2cat` is run with the ml option. This makes the ml models more transparent.
- Improved support for NaN and None in `get_mappings`.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ authors = [
]
description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
readme = "README.md"
version = "0.1.4.9008"
version = "0.1.4.9009"
requires-python = ">=3.8"
keywords = ["panel", "categorical", "longitudinal", "inconsistent", "cat2cat"]
license = {text = "Apache License 2.0 | file LICENSE"}
Expand Down
4 changes: 4 additions & 0 deletions src/cat2cat/cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def cat2cat(
`mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
When ml part is applied then `ml.cat_var` has to have the same type too.
Changes have to be made at the same time for the mapping table and datasets.
3. Missing values in the mapping table or in the categorical variable can cause problems.
It is recommended to use string or float types both in the mapping table and for the categorical variable.
Alternatively, missing values can be represented as a specific number (9999) or string ("Missing").
>>> from cat2cat import cat2cat
>>> from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
Expand Down
2 changes: 1 addition & 1 deletion src/cat2cat/cat2cat_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,14 +237,14 @@ def cat2cat_ml_run(

return cat2cat_ml_run_results(res, mappings, ml, kwargs)


def _cat2cat_ml(
ml: cat2cat_ml, mapp: Dict[Any, Any], target_df: DataFrame, cat_var_target: str
) -> None:
"""cat2cat ml optional part"""
for target_cat in list(mapp.keys()):
base_cats = mapp[target_cat]
ml_cat_var = ml.data[ml.cat_var]

if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1):
continue

Expand Down
6 changes: 4 additions & 2 deletions src/cat2cat/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def get_mappings(x: Table) -> Dict[str, Dict[Any, List[Any]]]:
Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
Note:
An effort was made to handle missing values properly, but please try to avoid using NaN or None.
It is recommended to use string or float types.
Alternatively, missing values can be represented as a specific number (9999) or string ("Missing").
>>> from cat2cat.mappings import get_mappings
>>> from numpy import array, nan
Expand All @@ -38,9 +40,9 @@ def get_mappings(x: Table) -> Dict[str, Dict[Any, List[Any]]]:
>>> mappings["to_new"]
{1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]}
"""
assert (len(x.shape) == 2) and (
assert hasattr(x, "shape") and ((len(x.shape) == 2) and (
x.shape[1] == 2
), "x should have 2 dimensions and the second one is equal to 2 (columns)"
)), "x should have 2 dimensions and the second one is equal to 2 (columns)"

if isinstance(x, DataFrame):
return get_mappings_df(x)
Expand Down
10 changes: 5 additions & 5 deletions tests/test_cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def int_round(x: float) -> int:
)

trans_int = trans.copy()
trans_int = trans_int.astype({"old": "Int64", "new": "Int64"})
trans_int = trans_int.astype({"old": "float", "new": "float"})

nr_rows_old = {"backward": 227662, "forward": 17223}
nr_rows_new = {"backward": 17323, "forward": 18680}
Expand All @@ -59,7 +59,7 @@ def int_round(x: float) -> int:
},
}
which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")}
code_var_name = {"backward": "code", "forward": "code4"}
ml_test_data = {"backward": {"var": "code", "data": occup.loc[occup.year >= 2010, :].copy()}, "forward": {"var": "code4", "data": occup.loc[occup.year <= 2008, :].copy()}}


@pytest.mark.parametrize("direction", ["backward", "forward"])
Expand Down Expand Up @@ -148,7 +148,7 @@ def test_cat2cat_custom_freqs(direction, cat_type):
assert data_dict[cat_type]["new"].equals(n)

@pytest.mark.parametrize("cat_type", ["str", "int"])
@pytest.mark.parametrize("direction", ["backward"])
@pytest.mark.parametrize("direction", ["backward", "forward"])
def test_cat2cat_ml(direction, cat_type):
o = data_dict[cat_type]["old"].copy()
n = data_dict[cat_type]["new"].copy()
Expand All @@ -159,8 +159,8 @@ def test_cat2cat_ml(direction, cat_type):
data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
)
ml = cat2cat_ml(
occup.loc[occup.year >= 2010, :].copy(),
code_var_name[direction],
ml_test_data[direction]["data"],
ml_test_data[direction]["var"],
["salary", "age", "edu", "sex"],
[DecisionTreeClassifier(), LinearDiscriminantAnalysis()],
)
Expand Down
30 changes: 15 additions & 15 deletions tests/test_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,21 @@ def test_get_mappings_DataFrame():
assert str(actual) == str(expected)


# Inputs that are not tables (an int, an empty string, an empty list, an empty dict)
# have no `shape` attribute, so the `hasattr(x, "shape")` guard in the `get_mappings`
# assertion is expected to fail with AssertionError.
@pytest.mark.parametrize("x", [1, "", [], {}])
def test_get_mappings_wrong(x):
with pytest.raises(AssertionError):
get_mappings(x)


# Minimal stand-in object exposing only a `shape` attribute of length 2 whose second
# entry is 2, i.e. it satisfies the shape assertion in `get_mappings` without being
# a DataFrame.
class class_with_shape:
shape = [1, 2]


# An object with a valid-looking two-column `shape` that is not a supported table type
# is expected to raise TypeError rather than AssertionError.
# NOTE(review): the code path in `get_mappings` that raises TypeError is outside this
# view — confirm against the full function.
def test_get_mappings_shape():
with pytest.raises(TypeError):
get_mappings(class_with_shape())


# test with NaNs
def test_get_mappings_nan_str():
trans2 = trans.copy()
Expand Down Expand Up @@ -239,21 +254,6 @@ def test_get_mappings_nan_float():
assert str(actual) == str(expected)


@pytest.mark.parametrize("x", [1, "", [], {}])
def test_get_mappings_wrong(x):
with pytest.raises(AttributeError):
get_mappings(x)


class class_with_shape:
shape = [1, 2]


def test_get_mappings_shape():
with pytest.raises(TypeError):
get_mappings(class_with_shape())


def test_get_mappings_different_types():
trans2 = trans.copy()
trans2["old"] = trans2["old"].astype(float)
Expand Down

0 comments on commit 4de1583

Please sign in to comment.