Skip to content

Commit

Permalink
Inherit metadata when resampling and grouping (#23)
Browse files Browse the repository at this point in the history
This PR drops support for pandas<1.1.5 and adds various tests to ensure metadata is propagated on various operations.
Specific code changes:

- Every constructor now calls finalize.
- for_each_belief is no longer passed self as the df to work on.

* Call finalize in all constructors to inherit metadata.
Prepare tests for upcoming pandas==1.1.1 functionality: inherit metadata when resampling and grouping.

* Update reference in test.
Add pandas test.

* Finish rebase.

* Complete the ordering of BeliefSources by name.

* Fix computation order.

* Workaround for aggregation function when resampling.

* Fix tests:
- Separate tests for mean resampling and aggregate resampling.
- Separate tests for temporary attributes and subclass attributes.
- Test metadata propagation for groupby of subclassed DataFrames.

* Update pandas dependency.
Drop support for pandas<1.1.5.
Bump timely-beliefs version with major release.

* Prepare dtype test for empty frames.

* Add comment about which pandas version fixed test_groupby_retains_metadata.

* Rename downsampling function.

* Simplify calls to for_each_belief.

* Fix plotting integer values (int64 is not JSON serializable).

* Add reference in test docs to relevant pandas issue.

* Issue 35 metadata lost on multiplication (#43)

This PR adds no additional logic, only a test to check whether Issue #35 is successfully resolved (plus some refactoring of test util functions). Note that this is a merge into resample-while-keeping-metadata, and that branch (with PR #23) actually contains the logic that resolves this issue (as a side effect, because I was actually addressing deeper issues there: #22 and #26).

* Refactor metadata propagation checks to util function.

* Add test for metadata propagation upon multiplication (GH 35).

Co-authored-by: F.N. Claessen <[email protected]>

Co-authored-by: Felix Claessen <[email protected]>
  • Loading branch information
Flix6x and Flix6x authored Dec 21, 2020
1 parent a56396a commit 103fd2f
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 48 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ This will create an interactive Vega-Lite chart like the one in the screenshot a

## Development

We welcome other contributions to timely_beliefs.
The `timely_beliefs` package runs on `pandas>=1.1.5`.
Contact us if you need support for older versions.
We welcome other contributions to `timely_beliefs`.

[See our developer docs for details.](dev/dev.md)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
"uncertainty",
"lineage",
],
version="0.1.3",
version="1.0.0",
install_requires=[
"pytz",
"pandas>=0.24,<1.1", # test_groupby_preserves_metadata fails on 1.1
"pandas>=1.1.5",
"numpy",
"pyerf",
"SQLAlchemy",
Expand Down
5 changes: 3 additions & 2 deletions timely_beliefs/beliefs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datetime import datetime, timedelta
from typing import List

import pandas as pd
from pandas.api.extensions import register_dataframe_accessor


Expand Down Expand Up @@ -104,7 +105,7 @@ def number_of_belief_times(self):
def number_of_beliefs(self) -> int:
"""Return the total number of beliefs in the BeliefsDataFrame, including both deterministic beliefs (which
require a single row) and probabilistic beliefs (which require multiple rows)."""
return len(self._obj.for_each_belief(df=self._obj))
return len(self._obj.for_each_belief())

@property
def sources(self) -> List[int]:
Expand All @@ -120,7 +121,7 @@ def number_of_sources(self):
@property
def number_of_probabilistic_beliefs(self) -> int:
"""Return the number of beliefs in the BeliefsDataFrame that are probabilistic (more than 1 unique value)."""
df = self._obj.for_each_belief(df=self._obj).nunique(dropna=True)
df = self._obj.for_each_belief(fnc=pd.DataFrame.nunique, dropna=True)
return len(df[df > 1].max(axis=1).dropna())

@property
Expand Down
79 changes: 55 additions & 24 deletions timely_beliefs/beliefs/classes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import math
from datetime import datetime, timedelta
from typing import Any, Callable, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import altair as alt
import numpy as np
import pandas as pd
from pandas.core.groupby import DataFrameGroupBy
from sqlalchemy import Column, DateTime, Float, ForeignKey, Integer, Interval
Expand Down Expand Up @@ -375,7 +374,11 @@ class BeliefsSeries(pd.Series):

@property
def _constructor(self):
return BeliefsSeries
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsSeries(*args, **kwargs).__finalize__(self, method="inherit")

return f

@property
def _constructor_expanddim(self):
Expand Down Expand Up @@ -436,7 +439,13 @@ class BeliefsDataFrame(pd.DataFrame):

@property
def _constructor(self):
return BeliefsDataFrame
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsDataFrame(*args, **kwargs).__finalize__(
self, method="inherit"
)

return f

@property
def _constructor_sliced(self):
Expand Down Expand Up @@ -1002,32 +1011,28 @@ def resample_events(
and keep_only_most_recent_belief
and df.lineage.number_of_sources == 1
):
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
if event_resolution > self.event_resolution:
# downsample
df = df.resample(event_resolution).agg(
{
"event_value": np.nanmean,
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
)
# make a new BeliefsDataFrame, because agg() doesn't behave nicely for subclassed DataFrames
df = BeliefsDataFrame(
df.reset_index(),
sensor=self.sensor,
event_resolution=event_resolution,
column_functions = {
"event_value": "mean",
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
df = downsample_beliefs_data_frame(
df, event_resolution, column_functions
)
df.event_resolution = event_resolution
else:
# upsample
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
new_index = pd.date_range(
start=df.index[0],
periods=len(df) * self.event_resolution // event_resolution,
periods=len(df) * (self.event_resolution // event_resolution),
freq=event_resolution,
name="event_start",
)
Expand Down Expand Up @@ -1454,7 +1459,7 @@ def set_columns_and_indices_for_empty_frame(df, columns, indices, default_types)
elif default_types[col] in (int, float):
df[col] = pd.to_numeric(df[col])

df.set_index(indices, inplace=True)
df.set_index(indices, inplace=True) # todo: pandas GH30517


def assign_sensor_and_event_resolution(df, sensor, event_resolution):
Expand All @@ -1467,3 +1472,29 @@ def assign_sensor_and_event_resolution(df, sensor, event_resolution):
if sensor
else None
)


def downsample_beliefs_data_frame(
    df: BeliefsDataFrame, event_resolution: timedelta, col_att_dict: Dict[str, str]
) -> BeliefsDataFrame:
    """Resample a BeliefsDataFrame to a coarser event resolution.

    Because df.resample().agg() doesn't behave nicely for subclassed DataFrames,
    we aggregate each index level and column separately against the resampled
    event timing level, and then recombine them afterwards.

    :param df: frame indexed by an event timing level ("event_start" or "event_end"),
               a belief timing level ("belief_time" or "belief_horizon"),
               "source" and "cumulative_probability"
    :param event_resolution: the new (coarser) event resolution to resample to
    :param col_att_dict: maps each column or index level name to the name of the
                         pandas resampler method used to aggregate it (e.g. "mean", "max")
    :returns: resampled frame, re-indexed with the same levels as the input
    """
    belief_timing_col = (
        "belief_time" if "belief_time" in df.index.names else "belief_horizon"
    )
    event_timing_col = "event_start" if "event_start" in df.index.names else "event_end"
    # Hoist the loop-invariant re-indexing out of the comprehension: move all index
    # levels into columns once, keeping only the event timing level as the
    # (datetime) index to resample against.
    flat_df = df.reset_index().set_index(event_timing_col)
    return pd.concat(
        [
            # Resample each column separately, applying its own aggregation method:
            # getattr(resampler, "mean")() is resampler.mean(), etc.
            getattr(flat_df[col].to_frame().resample(event_resolution), att)()
            for col, att in col_att_dict.items()
        ],
        axis=1,
    ).set_index([belief_timing_col, "source", "cumulative_probability"], append=True)
4 changes: 3 additions & 1 deletion timely_beliefs/sources/classes.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from functools import total_ordering
from typing import Union

from sqlalchemy import Column, Integer, String

from timely_beliefs.db_base import Base


@total_ordering
class BeliefSource(object):

"""
A belief source is any data-creating entitiy such as a user, a ML model or a script.
A belief source is any data-creating entity such as a user, a ML model or a script.
"""

name: str
Expand Down
Loading

0 comments on commit 103fd2f

Please sign in to comment.