From 7aad1c200809d79449c6242b773f4132b5d9f731 Mon Sep 17 00:00:00 2001 From: Felix Claessen <30658763+Flix6x@users.noreply.github.com> Date: Sat, 4 Nov 2023 16:07:30 +0100 Subject: [PATCH] Fix/read csv with dropna and resample (#153) * fix: move resample to after dropna feature: implement resampling for 3-column case Signed-off-by: F.N. Claessen * refactor: move resampling to util function Signed-off-by: F.N. Claessen * style: black Signed-off-by: F.N. Claessen --------- Signed-off-by: F.N. Claessen --- timely_beliefs/beliefs/utils.py | 42 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index c456f80d..3269f5d7 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -652,6 +652,9 @@ def read_csv( if not kwargs.get("keep_default_na", True): df = df.dropna() + if resample: + df = resample_events(df, sensor) + # Apply optionally set belief timing if belief_horizon is not None and belief_time is not None: raise ValueError("Cannot set both a belief horizon and a belief time.") @@ -759,22 +762,6 @@ def interpret_special_read_cases( timezone_to_convert_to=sensor.timezone, timezone_to_localize_to=timezone, ) - if resample: - df = df.set_index("event_start") - if df.index.freq is None and len(df) > 2: - # Try to infer the event resolution from the event frequency - df.index.freq = pd.infer_freq(df.index) - if df.index.freq is None: - raise NotImplementedError( - "Resampling is not supported for data without a discernible frequency." - ) - if df.index.freq > sensor.event_resolution: - # Upsample by forward filling - df = df.resample(sensor.event_resolution).ffill() - else: - # Downsample by computing the mean - df = df.resample(sensor.event_resolution).mean() - df = df.reset_index() elif len(df.columns) == 3: # datetimes in 1st and 2nd column, and value in 3rd column df.columns = ["event_start", "belief_time", "event_value"] @@ -800,6 +787,29 @@ def interpret_special_read_cases( return df +def resample_events(df: pd.DataFrame, sensor: "classes.Sensor") -> pd.DataFrame: + df = df.set_index("event_start") + if df.index.freq is None and len(df) > 2: + # Try to infer the event resolution from the event frequency + df.index.freq = pd.infer_freq(df.index) + if df.index.freq is None: + raise NotImplementedError( + "Resampling is not supported for data without a discernible frequency." + ) + if df.index.freq > sensor.event_resolution: + # Upsample by forward filling + df = df.resample(sensor.event_resolution).ffill() + else: + # Downsample by computing the mean event_value and max belief_time + if "belief_time" in df.columns: + df = df.resample(sensor.event_resolution).agg( + {"event_value": np.mean, "belief_time": np.max} + ) + else: + df = df.resample(sensor.event_resolution).agg({"event_value": np.mean}) + return df.reset_index() + + def convert_to_timezone( s: pd.Series, timezone_to_convert_to: str, timezone_to_localize_to: Optional[str] ) -> pd.Series: