Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ddfg add randc function #86

Merged
merged 6 commits into from
Aug 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions impyute/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
""" Shared functions to load/generate data """
import numpy as np
import string
import random
import math
eltonlaw marked this conversation as resolved.
Show resolved Hide resolved
import itertools
from impyute.dataset.corrupt import Corruptor
from impyute.util import BadInputError

def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
""" Return randomly generated dataset of numbers with uniformly
Expand All @@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
shape:tuple(optional)
Size of the randomly generated data
missingness: ('mcar', 'mar', 'mnar')
Type of missigness you want in your dataset
th: float between [0,1]
Type of missingness you want in your dataset
thr: float between [0,1]
Percentage of missing data in generated data
dtype: ('int','float')
Type of data
Expand Down Expand Up @@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
shape:tuple(optional)
Size of the randomly generated data
missingness: ('mcar', 'mar', 'mnar')
Type of missigness you want in your dataset
th: float between [0,1]
Type of missingness you want in your dataset
thr: float between [0,1]
Percentage of missing data in generated data
dtype: ('int','float')
Type of data
Expand All @@ -65,6 +70,51 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
raw_data = getattr(corruptor, missingness)()
return raw_data

def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with uniformly distributed
    categorical data (alphabetic character)

    Exactly `nlevels` distinct categories are present in the data before it
    is corrupted: every category is placed once, and the remaining cells are
    filled by drawing uniformly (with replacement) from the same categories.

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple(optional)
        Size of the randomly generated data (two dimensional)
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    BadInputError
        If `nlevels` is larger than the number of cells in `shape`.
    """
    n_cells = shape[0] * shape[1]
    if n_cells < nlevels:
        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    # Single letters first; once they are used up, keep appending longer
    # strings ('aa', 'ab', ... then 'aaa', ...) until the pool is big enough.
    cat_pool = list(string.ascii_lowercase)
    width = 2
    while len(cat_pool) < nlevels:
        cat_pool.extend(
            "".join(letters)
            for letters in itertools.product(string.ascii_lowercase, repeat=width)
        )
        width += 1

    cat = random.sample(cat_pool, nlevels)

    # Place every category once and fill the rest at random, then shuffle.
    # This guarantees `nlevels` distinct values in a single pass; redrawing
    # the whole matrix until all categories happen to appear practically
    # never terminates when nlevels is close to the number of cells.
    filler = np.random.choice(cat, n_cells - nlevels, replace=True)
    flat = np.concatenate([np.asarray(cat), filler])
    np.random.shuffle(flat)
    data = flat.reshape(shape)

    # The builtin `str` is what the removed `np.str` alias pointed to.
    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data



def test_data(mask=np.zeros((3, 3), dtype=bool)):
""" Returns a dataset to use with tests (INTERNAL USE - FOR UNIT TESTING)
Expand Down Expand Up @@ -98,4 +148,4 @@ def mnist(missingness="mcar", thr=0.2):
dataset = fetch_mldata('MNIST original')
corruptor = Corruptor(dataset.data, thr=thr)
data = getattr(corruptor, missingness)()
return {"X": data, "Y": dataset.target}
return {"X": data, "Y": dataset.target}
6 changes: 3 additions & 3 deletions impyute/dataset/corrupt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Corruptor:
----------
data: np.ndarray
Matrix of values with no NaN's that you want to add NaN's to.
th: float (optional)
thr: float (optional)
The percentage of null values you want in your dataset, a number
between 0 and 1.

Expand All @@ -23,10 +23,10 @@ class Corruptor:
Overwrite values with MNAR placed NaN's.

"""
def __init__(self, data, thr=0.2):
def __init__(self, data, thr=0.2, dtype=np.float):
self.dtype = data.dtype
self.shape = np.shape(data)
self.data = data.astype(np.float)
self.data = data.astype(dtype)
self.thr = thr

def mcar(self):
Expand Down
26 changes: 26 additions & 0 deletions test/dataset/test_randc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import pytest
from impyute.dataset.base import randc
from impyute.util import BadInputError

def test_raise_error_nlevel_exceed_shape():
    """randc must reject a shape with fewer cells than categories.

    The default nlevels is 5, while a (2, 2) shape only has 4 cells.
    """
    expected = "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape"
    with pytest.raises(BadInputError) as excinfo:
        randc(shape=(2, 2))
    assert expected == str(excinfo.value)

@pytest.mark.parametrize(
    "nlevels, shape",
    [
        (5, (5, 5)),
        (9, (3, 4)),
        (100, (20, 20)),
    ],
)
def test_nlevel_categories(nlevels, shape):
    """The result holds at most nlevels + 1 distinct values.

    The extra value accounts for the missing-value marker introduced by the
    Corruptor class. The count can be lower than nlevels + 1 when the
    corruption happens to overwrite every occurrence of some category, hence
    the upper-bound check instead of an equality.
    """
    generated = randc(nlevels, shape)
    distinct_count = len(np.unique(generated))
    assert distinct_count <= nlevels + 1


@pytest.mark.parametrize(
    "nlevels, shape",
    [
        (5, (5, 5)),
        (9, (3, 4)),
        (100, (20, 20)),
    ],
)
def test_dataframe_shape(nlevels, shape):
    """The array returned by randc has exactly the requested shape."""
    generated = randc(nlevels, shape)
    assert shape == generated.shape