Skip to content

Commit

Permalink
Fill default values when parent is missing (#1312)
Browse files Browse the repository at this point in the history
* Calculating direct features uses the default value if the parent is missing

* lint fixes

* Minor fixes for other related unit-tests

* Adding pytest-fixtures for dask/koalas

* Include unit-test for dask/koalas frames

* Updating release notes for #1217

* fix merge issue

* only fill for default values that are non nan

* fix test

* update release notes

* revert unrelated changes

* revert unrelated changes

* remove uncovered/unnecessary pytest.skip

* rename fixtures

Co-authored-by: Serial Lazer <[email protected]>
  • Loading branch information
thehomebrewnerd and seriallazer authored Jan 28, 2021
1 parent 02d960f commit 5450641
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 4 deletions.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Release Notes
**Future Release**
* Enhancements
* Fixes
* Direct feature calculation uses the default value when the parent is missing (:pr:`1312`)
* Changes
* Documentation Changes
* Update Twitter link to documentation toolbar (:pr:`1322`)
Expand All @@ -15,7 +16,7 @@ Release Notes
* Remove unnecessary test skips on Windows (:pr:`1320`)

Thanks to the following people for contributing to this release:
:user:`gsheni`, :user:`jeff-hernandez`, :user:`rwedge`, :user:`thehomebrewnerd`
:user:`gsheni`, :user:`jeff-hernandez`, :user:`rwedge`, :user:`thehomebrewnerd`, :user:`seriallazer`

**v0.23.0 Dec 31, 2020**
* Fixes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,12 @@ def _calculate_direct_features(self, features, child_df, df_trie, progress_callb
# new column names (in the child entity) for the merge
col_map = {relationship.parent_variable.id: merge_var}
index_as_feature = None

fillna_dict = {}
for f in features:
feature_defaults = {name: f.default_value
for name in f.get_feature_names() if not pd.isna(f.default_value)}
fillna_dict.update(feature_defaults)
if f.base_features[0].get_name() == relationship.parent_variable.id:
index_as_feature = f
base_names = f.base_features[0].get_feature_names()
Expand All @@ -565,7 +570,7 @@ def _calculate_direct_features(self, features, child_df, df_trie, progress_callb

progress_callback(len(features) / float(self.num_features))

return new_df
return new_df.fillna(fillna_dict)

def _calculate_agg_features(self, features, frame, df_trie, progress_callback):
test_feature = features[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -867,10 +867,12 @@ def test_empty_path_approximate_full(pd_es):
approximate=Timedelta(10, 's'),
cutoff_time=cutoff_time)
vals1 = feature_matrix[dfeat.get_name()].tolist()
assert np.isnan(vals1[0])
assert np.isnan(vals1[1])

assert (vals1[0] == 0)
assert (vals1[1] == 0)
assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]


# todo: do we need to test this situation?
# def test_empty_path_approximate_partial(pd_es):
# pd_es = copy.deepcopy(pd_es)
Expand Down Expand Up @@ -1828,6 +1830,21 @@ def test_calc_feature_matrix_with_cutoff_df_and_instance_ids(es):
assert (feature_matrix[property_feature.get_name()] == labels).values.all()


def test_calculate_feature_matrix_returns_default_values(default_value_es):
    """Check the values of a direct feature built on a ``Sum`` aggregation.

    A ``Sum`` of ``transactions.value`` is defined on ``sessions`` and then
    pulled back onto ``transactions`` as a direct feature. The resulting
    column is compared against ``[2.0, 2.0, 1.0, 0.0]`` after sorting by
    the ``id`` index.
    """
    transactions = default_value_es["transactions"]
    sessions = default_value_es["sessions"]

    # Aggregate on the parent entity, then expose the result on the child.
    sum_features = ft.Feature(transactions["value"],
                              parent_entity=sessions,
                              primitive=Sum)
    sessions_sum = ft.Feature(sum_features, entity=transactions)

    feature_matrix = ft.calculate_feature_matrix(features=[sessions_sum],
                                                 entityset=default_value_es)

    # NOTE(review): to_pandas presumably normalises dask/koalas output to a
    # pandas frame indexed and sorted by 'id' -- confirm against the helper.
    feature_matrix = to_pandas(feature_matrix, index='id', sort_index=True)

    expected_values = [2.0, 2.0, 1.0, 0.0]
    actual_column = feature_matrix[sessions_sum.get_name()]
    matches = actual_column == expected_values

    assert matches.values.all()


def test_entities_relationships(entities, relationships):
fm_1, features = ft.dfs(entities=entities,
relationships=relationships,
Expand Down
58 changes: 58 additions & 0 deletions featuretools/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,64 @@ def ks_diamond_es(pd_diamond_es):
return ft.EntitySet(id=pd_diamond_es.id, entities=entities, relationships=relationships)


@pytest.fixture(params=['pd_default_value_es', 'dask_default_value_es', 'ks_default_value_es'])
def default_value_es(request):
    """Return the EntitySet produced by the fixture named in ``request.param``.

    The fixture is parametrized over three fixture names, so a test that
    depends on it runs once per name.
    """
    backend_fixture_name = request.param
    return request.getfixturevalue(backend_fixture_name)


@pytest.fixture
def pd_default_value_es():
    """Build a pandas-backed EntitySet with a child row whose parent is absent.

    ``transactions`` holds four rows referencing sessions "a", "a", "b" and
    "c", while ``sessions`` only contains "a" and "b"; the transaction that
    points at "c" therefore has no matching parent row.

    Returns:
        The EntitySet with both entities and the sessions -> transactions
        relationship registered.
    """
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "session_id": ["a", "a", "b", "c"],
        "value": [1, 1, 1, 1],
    })
    sessions_df = pd.DataFrame({"id": ["a", "b"]})

    entityset = ft.EntitySet()
    # Register the child entity first, then the parent; both are indexed by "id".
    for entity_id, frame in (("transactions", transactions_df),
                             ("sessions", sessions_df)):
        entityset.entity_from_dataframe(entity_id=entity_id,
                                        dataframe=frame,
                                        index="id")

    parent_variable = entityset["sessions"]["id"]
    child_variable = entityset["transactions"]["session_id"]
    entityset.add_relationship(ft.Relationship(parent_variable, child_variable))
    return entityset


@pytest.fixture
def dask_default_value_es(pd_default_value_es):
    """Rebuild ``pd_default_value_es`` with each entity dataframe as a Dask frame.

    Every entity's dataframe is converted with ``dd.from_pandas`` using four
    partitions, and the relationships are re-expressed as
    ``(parent entity id, parent variable name, child entity id,
    child variable name)`` tuples before a new EntitySet is created with the
    same id.
    """
    source_es = pd_default_value_es

    entities = {
        entity.id: (dd.from_pandas(entity.df, npartitions=4),
                    entity.index,
                    None,
                    entity.variable_types)
        for entity in source_es.entities
    }

    relationships = []
    for rel in source_es.relationships:
        relationships.append((rel.parent_entity.id,
                              rel.parent_variable.name,
                              rel.child_entity.id,
                              rel.child_variable.name))

    return ft.EntitySet(id=source_es.id,
                        entities=entities,
                        relationships=relationships)


@pytest.fixture
def ks_default_value_es(pd_default_value_es):
    """Rebuild ``pd_default_value_es`` with each entity dataframe as a Koalas frame.

    The test is skipped when ``databricks.koalas`` cannot be imported. Each
    entity's dataframe is passed through ``pd_to_ks_clean`` and then
    ``from_pandas``; the relationships are re-expressed as
    ``(parent entity id, parent variable name, child entity id,
    child variable name)`` tuples before a new EntitySet is created with the
    same id.
    """
    # Must run before anything else so the skip happens up front.
    koalas = pytest.importorskip('databricks.koalas', reason="Koalas not installed, skipping")
    source_es = pd_default_value_es

    entities = {
        entity.id: (koalas.from_pandas(pd_to_ks_clean(entity.df)),
                    entity.index,
                    None,
                    entity.variable_types)
        for entity in source_es.entities
    }

    relationships = []
    for rel in source_es.relationships:
        relationships.append((rel.parent_entity.id,
                              rel.parent_variable.name,
                              rel.child_entity.id,
                              rel.child_variable.name))

    return ft.EntitySet(id=source_es.id,
                        entities=entities,
                        relationships=relationships)


@pytest.fixture(params=['pd_home_games_es', 'dask_home_games_es', 'ks_home_games_es'])
def home_games_es(request):
    """Return the EntitySet produced by the fixture named in ``request.param``.

    The fixture is parametrized over three fixture names, so a test that
    depends on it runs once per name.
    """
    backend_fixture_name = request.param
    return request.getfixturevalue(backend_fixture_name)
Expand Down

0 comments on commit 5450641

Please sign in to comment.