
Commit

Calculate p-values when statistical metric is negative. Thanks to Renger Jellema.

kgdunn committed Apr 22, 2024
1 parent e525e4c commit b016af2
Showing 4 changed files with 235 additions and 6 deletions.
202 changes: 202 additions & 0 deletions notebooks_examples/batch/batch_llm_multivariate.py
@@ -0,0 +1,202 @@
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.11.2
#   kernelspec:
#     display_name: process-improve
#     language: python
#     name: python3
# ---

# %%
# !pip install git+https://github.com/SalesforceAIResearch/uni2ts


from __future__ import annotations

import os
import pathlib
import sys

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from gluonts.dataset.pandas import PandasDataset
from huggingface_hub import hf_hub_download
from uni2ts.model.moirai import MoiraiForecast

# Basic checks: move up to the repository root and put it on the import path.
while not str(pathlib.Path.cwd().name).lower().startswith("process-improve"):
    os.chdir(pathlib.Path.cwd().parents[0])
basecwd = pathlib.Path.cwd()
sys.path.insert(0, str(basecwd))
assert basecwd.exists()

import process_improve.datasets.batch as batch_ds
from process_improve.batch.data_input import melted_to_dict
from process_improve.batch.plotting import plot_all_batches_per_tag

pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 20
pd.options.display.width = 200

device_map = "cpu" # use "cpu" for CPU inference and "mps" for Apple Silicon; or "cuda"

# Ideally, use more than 1 tag to align on. These columns must exist in all data frames for all batches.
columns_to_import = ["AgitatorPower", "AgitatorTorque", "JacketTemperature", "DryerTemp"]
tag_to_plot = columns_to_import[3]


TIME_COL = "Date"
TARGET = "visits"
DYNAMIC_COV = ["CPI", "Inflation_Rate", "GDP"]
SEAS_COV = [
    "month_1",
    "month_2",
    "month_3",
    "month_4",
    "month_5",
    "month_6",
    "month_7",
    "month_8",
    "month_9",
    "month_10",
    "month_11",
    "month_12",
]
FORECAST_HORIZON = 8 # months
FREQ = "M"


# %%
def preprocess_dataset(dataframe: pd.DataFrame, dynamic_cov: list, time_col: str, target: str) -> pd.DataFrame:
    """
    Receive the raw dataframe and clean it:

    - create a unique id column
    - set a dynamic start date for each series, based on the first date where the target is non-zero

    Args:
        dataframe (pd.DataFrame): raw data
        dynamic_cov (list): column names with dynamic covariates
        time_col (str): time column name
        target (str): target column name

    Returns:
        pd.DataFrame: cleaned data
    """

    # save dynamic covariates for later
    dynamic_cov_df = dataframe[dynamic_cov].reset_index().drop_duplicates()

    # create target and unique id columns
    dataframe = (
        dataframe.loc[:, ~dataframe.columns.isin(dynamic_cov)]
        .melt(ignore_index=False)
        .reset_index()
        .rename(columns={"variable": "unique_id", "value": target})
    )

    # create dynamic start dates for each series
    cleaned_df = []
    for i in dataframe["unique_id"].unique():
        temp = dataframe[dataframe["unique_id"] == i]
        cleaned_df.append(temp[temp[time_col] >= min(temp[temp[target] > 0][time_col])])
    cleaned_df = pd.concat(cleaned_df)

    # join dynamic covariates
    cleaned_df = pd.merge(cleaned_df, dynamic_cov_df, on=[time_col], how="left")

    return cleaned_df
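
# %%
# Editor's sketch, not part of the original commit: a toy call showing what
# `preprocess_dataset` does. `_toy` is a hypothetical long frame indexed by date,
# with one column per series plus one dynamic covariate.
_toy = pd.DataFrame(
    {
        "series_a": [0, 0, 5, 7],
        "series_b": [2, 3, 4, 5],
        "CPI": [100.0, 100.5, 101.0, 101.2],
    },
    index=pd.date_range("2020-01-31", periods=4, freq="M"),
).rename_axis("Date")
_cleaned = preprocess_dataset(_toy, dynamic_cov=["CPI"], time_col="Date", target="visits")
# `series_a` is trimmed to start at 2020-03-31 (its first non-zero value);
# `series_b` keeps all four rows. Both carry the merged CPI covariate.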


def moirai_forecast_to_pandas(forecast, test_df: pd.DataFrame, forecast_horizon: int, time_col: str) -> pd.DataFrame:
    """
    Convert a MOIRAI forecast into a pandas dataframe.

    Args:
        forecast: MOIRAI's forecast
        test_df: dataframe with actuals
        forecast_horizon: forecast horizon
        time_col: date column

    Returns:
        pd.DataFrame: forecast in pandas format
    """

    d = {"unique_id": [], time_col: [], "forecast": [], "forecast_lower": [], "forecast_upper": []}

    for ts in forecast:
        for j in range(forecast_horizon):
            d["unique_id"].append(ts.item_id)
            d[time_col].append(test_df[test_df["unique_id"] == ts.item_id][time_col].tolist()[j])

            # all sampled values for horizon step j
            temp = [sample[j] for sample in ts.samples]

            d["forecast"].append(np.median(temp))
            d["forecast_lower"].append(np.percentile(temp, 10))
            d["forecast_upper"].append(np.percentile(temp, 90))

    return pd.DataFrame(d)


# load data and exogenous features
df = pd.DataFrame(load_dataset("zaai-ai/time_series_datasets", data_files={"train": "data.csv"})["train"]).drop(
    columns=["Unnamed: 0"]
)
df[TIME_COL] = pd.to_datetime(df[TIME_COL])

# one hot encode month
df["month"] = df[TIME_COL].dt.month
df = pd.get_dummies(df, columns=["month"], dtype=int)

print(f"Distinct number of time series: {len(df['unique_id'].unique())}")
df.head()

# 8 months to test
train = df[df[TIME_COL] <= (max(df[TIME_COL]) - pd.DateOffset(months=FORECAST_HORIZON))]
test = df[df[TIME_COL] > (max(df[TIME_COL]) - pd.DateOffset(months=FORECAST_HORIZON))]

print(
    f"Months for training: {len(train[TIME_COL].unique())} from {min(train[TIME_COL]).date()} to {max(train[TIME_COL]).date()}"
)
print(
    f"Months for testing: {len(test[TIME_COL].unique())} from {min(test[TIME_COL]).date()} to {max(test[TIME_COL]).date()}"
)


# create GluonTS dataset from pandas
ds = PandasDataset.from_long_dataframe(
    pd.concat([train, test[["unique_id", TIME_COL] + DYNAMIC_COV + SEAS_COV]]).set_index(
        TIME_COL
    ),  # concatenation with the test set's dynamic covariates
    item_id="unique_id",
    feat_dynamic_real=DYNAMIC_COV + SEAS_COV,
    target=TARGET,
    freq=FREQ,
)


# Prepare the pre-trained model by downloading the model weights from the Hugging Face Hub
model = MoiraiForecast.load_from_checkpoint(
    checkpoint_path=hf_hub_download(repo_id="Salesforce/moirai-R-large", filename="model.ckpt"),
    prediction_length=FORECAST_HORIZON,
    context_length=24,
    patch_size="auto",
    num_samples=100,
    target_dim=1,
    feat_dynamic_real_dim=ds.num_feat_dynamic_real,
    past_feat_dynamic_real_dim=ds.num_past_feat_dynamic_real,
    map_location="cuda:0" if torch.cuda.is_available() else "cpu",
)

predictor = model.create_predictor(batch_size=32)
forecasts = predictor.predict(ds)

# convert forecast into pandas
forecast_df = moirai_forecast_to_pandas(forecasts, test, FORECAST_HORIZON, TIME_COL)
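
# %%
# Editor's sketch, not part of the original commit: one plausible way to score
# the point forecast against the held-out actuals (per-series MAPE), assuming
# `forecast_df` and `test` share the `unique_id` and Date columns built above.
_eval = forecast_df.merge(test[["unique_id", TIME_COL, TARGET]], on=["unique_id", TIME_COL], how="left")
_eval["ape"] = (_eval["forecast"] - _eval[TARGET]).abs() / _eval[TARGET].abs()
print(_eval.groupby("unique_id")["ape"].mean().sort_values().head())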
2 changes: 1 addition & 1 deletion process_improve/__init__.py
@@ -1,3 +1,3 @@
"""(c) Kevin Dunn, 2010-2024. MIT License."""

__version__ = "0.9.70"
__version__ = "0.9.71"
7 changes: 2 additions & 5 deletions process_improve/univariate/metrics.py
@@ -757,11 +757,8 @@ def outlier_detection_multiple(
R_i_idx = R.idxmax()
g = R.max()
# Formula from the R-function for the Grubbs' calculation
-t = np.sqrt((g**2 * N * (2 - N)) / (g**2 * N - (N - 1) ** 2))
-if t <= 0:
-    p_value = 0
-else:
-    p_value = min(N * (1 - t_value_cdf(t, N - 2)), 1)
+s = g**2 * N * (2 - N) / (g**2 * N - (N - 1) ** 2)
+p_value = 0 if s <= 0 else min(N * (1 - t_value_cdf(np.sqrt(s), N - 2)), 1)

extra_out["R_i_idx"].append(R_i_idx)
extra_out["R_i"].append(R.max())
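
Why the change above matters: for the data this commit's new test exercises, the radicand in the Grubbs formula goes negative. `np.sqrt` of a negative float returns NaN, and because every comparison against NaN is False, the old `if t <= 0` guard never fired and the NaN propagated into the p-value. A minimal sketch of the failure mode, with an illustrative radicand (not a value from the commit):

import numpy as np

s = -0.5  # stands in for g**2 * N * (2 - N) / (g**2 * N - (N - 1) ** 2) going negative
t = np.sqrt(s)  # nan, with a RuntimeWarning
print(t <= 0)  # False: NaN fails every comparison, so the old guard was skipped
p_value = 0 if s <= 0 else None  # the fix: test the radicand before taking the square root

Checking `s` before the square root yields a well-defined p-value of 0 in this regime, which is exactly what the new test below asserts.
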
30 changes: 30 additions & 0 deletions tests/test_univariate.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import numpy as np
import pandas as pd
import pytest
@@ -545,6 +547,34 @@ def test_within_between_sd_missing_values():
assert out["between_dof"] == 0


@pytest.fixture()
def outliers_data_measurement() -> list[float]:
    """From an actual use-case."""
    return [
        10769.166,
        10043.447,
        10171.783,
        10751.362,
        10684.675,
        10772.250,
        10830.804,
    ]


def test_edge_case_outliers(outliers_data_measurement) -> None:
    """Test an actual edge case that did not return p-values."""
    max_outliers = len(outliers_data_measurement) - 5
    outliers, reasons = univariate.outlier_detection_multiple(
        outliers_data_measurement, algoritm="esd", max_outliers_detected=max_outliers, alpha=0.05
    )

    assert len(outliers) == 1
    assert reasons["lambda"][0] == pytest.approx(2.0199685076)
    assert reasons["R_i"][0] == pytest.approx(7.16003736)
    assert reasons["p-value"][0] == 0
    assert reasons["p-value"][1] == 0


@pytest.fixture()
def outliers_data():
    # Rosner data set: https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
Expand Down
