From bdbcba183bf05c275fdf8bb575ea9d9d2aeb9566 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Sat, 4 Nov 2023 15:37:02 +0100 Subject: [PATCH 1/5] fix: move resample to after dropna feature: implement resampling for 3-column case Signed-off-by: F.N. Claessen --- timely_beliefs/beliefs/utils.py | 36 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index c456f80d..affd2e7e 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -652,6 +652,26 @@ def read_csv( if not kwargs.get("keep_default_na", True): df = df.dropna() + if resample: + df = df.set_index("event_start") + if df.index.freq is None and len(df) > 2: + # Try to infer the event resolution from the event frequency + df.index.freq = pd.infer_freq(df.index) + if df.index.freq is None: + raise NotImplementedError( + "Resampling is not supported for data without a discernible frequency." + ) + if df.index.freq > sensor.event_resolution: + # Upsample by forward filling + df = df.resample(sensor.event_resolution).ffill() + else: + # Downsample by computing the mean event_value and max belief_time + if "belief_time" in df.columns: + df = df.resample(sensor.event_resolution).agg({"event_value": np.mean, "belief_time": np.max}) + else: + df = df.resample(sensor.event_resolution).agg({"event_value": np.mean}) + df = df.reset_index() + # Apply optionally set belief timing if belief_horizon is not None and belief_time is not None: raise ValueError("Cannot set both a belief horizon and a belief time.") @@ -759,22 +779,6 @@ def interpret_special_read_cases( timezone_to_convert_to=sensor.timezone, timezone_to_localize_to=timezone, ) - if resample: - df = df.set_index("event_start") - if df.index.freq is None and len(df) > 2: - # Try to infer the event resolution from the event frequency - df.index.freq = pd.infer_freq(df.index) - if df.index.freq is None: - raise 
NotImplementedError( - "Resampling is not supported for data without a discernible frequency." - ) - if df.index.freq > sensor.event_resolution: - # Upsample by forward filling - df = df.resample(sensor.event_resolution).ffill() - else: - # Downsample by computing the mean - df = df.resample(sensor.event_resolution).mean() - df = df.reset_index() elif len(df.columns) == 3: # datetimes in 1st and 2nd column, and value in 3rd column df.columns = ["event_start", "belief_time", "event_value"] From 8e50d2f3cb511dda2b9f475699f16159a2d22b17 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Sat, 4 Nov 2023 15:40:09 +0100 Subject: [PATCH 2/5] refactor: move resampling to util function Signed-off-by: F.N. Claessen --- timely_beliefs/beliefs/utils.py | 40 ++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index affd2e7e..6e8d97b4 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -653,24 +653,7 @@ def read_csv( df = df.dropna() if resample: - df = df.set_index("event_start") - if df.index.freq is None and len(df) > 2: - # Try to infer the event resolution from the event frequency - df.index.freq = pd.infer_freq(df.index) - if df.index.freq is None: - raise NotImplementedError( - "Resampling is not supported for data without a discernible frequency." 
- ) - if df.index.freq > sensor.event_resolution: - # Upsample by forward filling - df = df.resample(sensor.event_resolution).ffill() - else: - # Downsample by computing the mean event_value and max belief_time - if "belief_time" in df.columns: - df = df.resample(sensor.event_resolution).agg({"event_value": np.mean, "belief_time": np.max}) - else: - df = df.resample(sensor.event_resolution).agg({"event_value": np.mean}) - df = df.reset_index() + df = resample_events(df, sensor) # Apply optionally set belief timing if belief_horizon is not None and belief_time is not None: @@ -804,6 +787,27 @@ def interpret_special_read_cases( return df +def resample_events(df: pd.DataFrame, sensor: "classes.Sensor") -> pd.DataFrame: + df = df.set_index("event_start") + if df.index.freq is None and len(df) > 2: + # Try to infer the event resolution from the event frequency + df.index.freq = pd.infer_freq(df.index) + if df.index.freq is None: + raise NotImplementedError( + "Resampling is not supported for data without a discernible frequency." + ) + if df.index.freq > sensor.event_resolution: + # Upsample by forward filling + df = df.resample(sensor.event_resolution).ffill() + else: + # Downsample by computing the mean event_value and max belief_time + if "belief_time" in df.columns: + df = df.resample(sensor.event_resolution).agg({"event_value": np.mean, "belief_time": np.max}) + else: + df = df.resample(sensor.event_resolution).agg({"event_value": np.mean}) + return df.reset_index() + + def convert_to_timezone( s: pd.Series, timezone_to_convert_to: str, timezone_to_localize_to: Optional[str] ) -> pd.Series: From 6d5313ec0500d4eb2e620481a818c9300d422605 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Sat, 4 Nov 2023 15:40:50 +0100 Subject: [PATCH 3/5] style: black Signed-off-by: F.N. 
Claessen --- timely_beliefs/beliefs/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index 6e8d97b4..3269f5d7 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -802,7 +802,9 @@ def resample_events(df: pd.DataFrame, sensor: "classes.Sensor") -> pd.DataFrame: else: # Downsample by computing the mean event_value and max belief_time if "belief_time" in df.columns: - df = df.resample(sensor.event_resolution).agg({"event_value": np.mean, "belief_time": np.max}) + df = df.resample(sensor.event_resolution).agg( + {"event_value": np.mean, "belief_time": np.max} + ) else: df = df.resample(sensor.event_resolution).agg({"event_value": np.mean}) return df.reset_index() From 019edb48e2b2a3117906e45d14cf3f19043e2cd0 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Sat, 4 Nov 2023 15:43:51 +0100 Subject: [PATCH 4/5] feature: filter by time window when reading from csv Signed-off-by: F.N. Claessen --- timely_beliefs/beliefs/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index 3269f5d7..a9c8608c 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -535,6 +535,8 @@ def read_csv( resample: bool = False, timezone: Optional[str] = None, filter_by_column: dict = None, + event_ends_after: datetime = None, + event_starts_before: datetime = None, datetime_column_split: str | None = None, transformations: list[dict] = None, **kwargs, @@ -561,6 +563,12 @@ def read_csv( If not set and timezone naive datetimes are read in, the data is localized to UTC. :param filter_by_column: Select a subset of rows by filtering on a specific value for a specific column. For example: {4: 1995} selects all rows where column 4 contains the value 1995. + :param event_ends_after: Optionally, keep only events that end after this datetime. 
+ Exclusive for non-instantaneous events, inclusive for instantaneous events. + Note that the first event may transpire partially before this datetime. + :param event_starts_before: Optionally, keep only events that start before this datetime. + Exclusive for non-instantaneous events, inclusive for instantaneous events. + Note that the last event may transpire partially after this datetime. :param datetime_column_split: Optionally, help parse the datetime column by splitting according to some string. For example: "1 jan 2022 00:00 - 1 jan 2022 01:00" @@ -652,6 +660,17 @@ def read_csv( if not kwargs.get("keep_default_na", True): df = df.dropna() + if event_ends_after: + if sensor.event_resolution == timedelta(0): + df = df[df["event_start"] + sensor.event_resolution >= event_ends_after] + else: + df = df[df["event_start"] + sensor.event_resolution > event_ends_after] + if event_starts_before: + if sensor.event_resolution == timedelta(0): + df = df[df["event_start"] <= event_starts_before] + else: + df = df[df["event_start"] < event_starts_before] + if resample: df = resample_events(df, sensor) From 13f7cec220b94dfc96be9684212248e16fb0b271 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Sat, 4 Nov 2023 16:11:44 +0100 Subject: [PATCH 5/5] style: flake8 Signed-off-by: F.N. Claessen --- timely_beliefs/beliefs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timely_beliefs/beliefs/utils.py b/timely_beliefs/beliefs/utils.py index a9c8608c..125c1449 100644 --- a/timely_beliefs/beliefs/utils.py +++ b/timely_beliefs/beliefs/utils.py @@ -524,7 +524,7 @@ def set_reference( ) -def read_csv( +def read_csv( # noqa: C901 path: str, sensor: "classes.Sensor", source: "classes.BeliefSource" = None,