Skip to content

Commit

Permalink
Inherit metadata when resampling and grouping (#23)
Browse files Browse the repository at this point in the history
This PR drops support for pandas<1.1.5 and adds various tests to ensure metadata is propagated on various operations.
Specific code changes:

- Every constructor now calls finalize.
- for_each_belief is no longer passed self as the df to work on.

* Call finalize in all constructors to inherit metadata.
Prepare tests for upcoming pandas==1.1.1 functionality: inherit metadata when resampling and grouping.

* Update reference in test.
Add pandas test.

* Finish rebase.

* Complete the ordering of BeliefSources by name.

* Fix computation order.

* Workaround for aggregation function when resampling.

* Fix tests:
- Separate tests for mean resampling and aggregate resampling.
- Separate tests for temporary attributes and subclass attributes.
- Test metadata propagation for groupby of subclassed DataFrames.

* Update pandas dependency.
Drop support for pandas<1.1.5.
Bump timely-beliefs version with major release.

* Prepare dtype test for empty frames.

* Add comment about which pandas version fixed test_groupby_retains_metadata.

* Rename downsampling function.

* Simplify calls to for_each_belief.

* Fix plotting integer values (int64 is not JSON serializable).

* Add reference in test docs to relevant pandas issue.

* Issue 35 metadata lost on multiplication (#43)

This PR adds no additional logic, only a test to check whether Issue #35 is successfully resolved (plus some refactoring of test util functions). Note that this is a merge into resample-while-keeping-metadata, and that branch (with PR #23) actually contains the logic that resolves this issue (as a side effect, because I was actually addressing deeper issues there: #22 and #26).

* Refactor metadata propagation checks to util function.

* Add test for metadata propagation upon multiplication (GH 35).

Co-authored-by: F.N. Claessen <[email protected]>

Co-authored-by: Felix Claessen <[email protected]>
  • Loading branch information
Flix6x and Flix6x authored Dec 21, 2020
1 parent a56396a commit 103fd2f
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 48 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ This will create an interactive Vega-Lite chart like the one in the screenshot a

## Development

We welcome other contributions to timely_beliefs.
The `timely_beliefs` package runs on `pandas>=1.1.5`.
Contact us if you need support for older versions.
We welcome other contributions to `timely_beliefs`.

[See our developer docs for details.](dev/dev.md)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
"uncertainty",
"lineage",
],
version="0.1.3",
version="1.0.0",
install_requires=[
"pytz",
"pandas>=0.24,<1.1", # test_groupby_preserves_metadata fails on 1.1
"pandas>=1.1.5",
"numpy",
"pyerf",
"SQLAlchemy",
Expand Down
5 changes: 3 additions & 2 deletions timely_beliefs/beliefs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datetime import datetime, timedelta
from typing import List

import pandas as pd
from pandas.api.extensions import register_dataframe_accessor


Expand Down Expand Up @@ -104,7 +105,7 @@ def number_of_belief_times(self):
def number_of_beliefs(self) -> int:
"""Return the total number of beliefs in the BeliefsDataFrame, including both deterministic beliefs (which
require a single row) and probabilistic beliefs (which require multiple rows)."""
return len(self._obj.for_each_belief(df=self._obj))
return len(self._obj.for_each_belief())

@property
def sources(self) -> List[int]:
Expand All @@ -120,7 +121,7 @@ def number_of_sources(self):
@property
def number_of_probabilistic_beliefs(self) -> int:
"""Return the number of beliefs in the BeliefsDataFrame that are probabilistic (more than 1 unique value)."""
df = self._obj.for_each_belief(df=self._obj).nunique(dropna=True)
df = self._obj.for_each_belief(fnc=pd.DataFrame.nunique, dropna=True)
return len(df[df > 1].max(axis=1).dropna())

@property
Expand Down
79 changes: 55 additions & 24 deletions timely_beliefs/beliefs/classes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import math
from datetime import datetime, timedelta
from typing import Any, Callable, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import altair as alt
import numpy as np
import pandas as pd
from pandas.core.groupby import DataFrameGroupBy
from sqlalchemy import Column, DateTime, Float, ForeignKey, Integer, Interval
Expand Down Expand Up @@ -375,7 +374,11 @@ class BeliefsSeries(pd.Series):

@property
def _constructor(self):
return BeliefsSeries
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsSeries(*args, **kwargs).__finalize__(self, method="inherit")

return f

@property
def _constructor_expanddim(self):
Expand Down Expand Up @@ -436,7 +439,13 @@ class BeliefsDataFrame(pd.DataFrame):

@property
def _constructor(self):
return BeliefsDataFrame
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsDataFrame(*args, **kwargs).__finalize__(
self, method="inherit"
)

return f

@property
def _constructor_sliced(self):
Expand Down Expand Up @@ -1002,32 +1011,28 @@ def resample_events(
and keep_only_most_recent_belief
and df.lineage.number_of_sources == 1
):
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
if event_resolution > self.event_resolution:
# downsample
df = df.resample(event_resolution).agg(
{
"event_value": np.nanmean,
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
)
# make a new BeliefsDataFrame, because agg() doesn't behave nicely for subclassed DataFrames
df = BeliefsDataFrame(
df.reset_index(),
sensor=self.sensor,
event_resolution=event_resolution,
column_functions = {
"event_value": "mean",
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
df = downsample_beliefs_data_frame(
df, event_resolution, column_functions
)
df.event_resolution = event_resolution
else:
# upsample
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
new_index = pd.date_range(
start=df.index[0],
periods=len(df) * self.event_resolution // event_resolution,
periods=len(df) * (self.event_resolution // event_resolution),
freq=event_resolution,
name="event_start",
)
Expand Down Expand Up @@ -1454,7 +1459,7 @@ def set_columns_and_indices_for_empty_frame(df, columns, indices, default_types)
elif default_types[col] in (int, float):
df[col] = pd.to_numeric(df[col])

df.set_index(indices, inplace=True)
df.set_index(indices, inplace=True) # todo: pandas GH30517


def assign_sensor_and_event_resolution(df, sensor, event_resolution):
Expand All @@ -1467,3 +1472,29 @@ def assign_sensor_and_event_resolution(df, sensor, event_resolution):
if sensor
else None
)


def downsample_beliefs_data_frame(
    df: BeliefsDataFrame, event_resolution: timedelta, col_att_dict: Dict[str, str]
) -> BeliefsDataFrame:
    """Resample a BeliefsDataFrame to a coarser event resolution.

    Because df.resample().agg() doesn't behave nicely for subclassed DataFrames,
    we aggregate each index level and column separately against the resampled
    event timing level, and then recombine them afterwards.

    :param df: frame indexed by an event timing level ("event_start" or "event_end"),
               a belief timing level ("belief_time" or "belief_horizon"),
               "source" and "cumulative_probability"
    :param event_resolution: the new (coarser) event resolution to resample to
    :param col_att_dict: maps each column or index level name to the name of the
                         pandas resampler method used to aggregate it (e.g. "mean", "max")
    :returns: resampled frame, re-indexed with the same levels as the input
    """
    belief_timing_col = (
        "belief_time" if "belief_time" in df.index.names else "belief_horizon"
    )
    event_timing_col = "event_start" if "event_start" in df.index.names else "event_end"
    # Hoist the loop-invariant re-indexing out of the comprehension: move all index
    # levels into columns once, keeping only the event timing level as the
    # (datetime) index to resample against.
    flat_df = df.reset_index().set_index(event_timing_col)
    return pd.concat(
        [
            # Resample each column separately, applying its own aggregation method:
            # getattr(resampler, "mean")() is resampler.mean(), etc.
            getattr(flat_df[col].to_frame().resample(event_resolution), att)()
            for col, att in col_att_dict.items()
        ],
        axis=1,
    ).set_index([belief_timing_col, "source", "cumulative_probability"], append=True)
4 changes: 3 additions & 1 deletion timely_beliefs/sources/classes.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from functools import total_ordering
from typing import Union

from sqlalchemy import Column, Integer, String

from timely_beliefs.db_base import Base


@total_ordering
class BeliefSource(object):

"""
A belief source is any data-creating entitiy such as a user, a ML model or a script.
A belief source is any data-creating entity such as a user, a ML model or a script.
"""

name: str
Expand Down
Loading

0 comments on commit 103fd2f

Please sign in to comment.