
Commit

Calculate p-values when statistical metric is negative. Thanks to Renger Jellema.

kgdunn committed Apr 22, 2024
1 parent e525e4c commit b016af2
Showing 4 changed files with 235 additions and 6 deletions.
202 changes: 202 additions & 0 deletions notebooks_examples/batch/batch_llm_multivariate.py
@@ -0,0 +1,202 @@
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.11.2
#   kernelspec:
#     display_name: process-improve
#     language: python
#     name: python3
# ---

# %%
# !pip install git+https://github.com/SalesforceAIResearch/uni2ts


from __future__ import annotations

import os
import pathlib
import sys

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from gluonts.dataset.pandas import PandasDataset
from huggingface_hub import hf_hub_download
from uni2ts.model.moirai import MoiraiForecast

# Basic checks: move up to the repository root and put it on the import path.
while not str(pathlib.Path.cwd().name).lower().startswith("process-improve"):
    os.chdir(pathlib.Path.cwd().parents[0])
basecwd = pathlib.Path.cwd()
sys.path.insert(0, str(basecwd))
assert basecwd.exists()

import process_improve.datasets.batch as batch_ds
from process_improve.batch.data_input import melted_to_dict
from process_improve.batch.plotting import plot_all_batches_per_tag

pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 20
pd.options.display.width = 200

device_map = "cpu" # use "cpu" for CPU inference and "mps" for Apple Silicon; or "cuda"

# Ideally, use more than 1 tag to align on. These columns must exist in all data frames for all batches.
columns_to_import = ["AgitatorPower", "AgitatorTorque", "JacketTemperature", "DryerTemp"]
tag_to_plot = columns_to_import[3]


TIME_COL = "Date"
TARGET = "visits"
DYNAMIC_COV = ["CPI", "Inflation_Rate", "GDP"]
SEAS_COV = [
    "month_1",
    "month_2",
    "month_3",
    "month_4",
    "month_5",
    "month_6",
    "month_7",
    "month_8",
    "month_9",
    "month_10",
    "month_11",
    "month_12",
]
FORECAST_HORIZON = 8 # months
FREQ = "M"


# %%
def preprocess_dataset(dataframe: pd.DataFrame, dynamic_cov: list, time_col: str, target: str) -> pd.DataFrame:
    """
    Receive the raw dataframe and clean it:

    - create a unique id column
    - set a dynamic start date for each series, based on the first date where the target is non-zero

    Args:
        dataframe (pd.DataFrame): raw data
        dynamic_cov (list): column names with dynamic covariates
        time_col (str): time column name
        target (str): target column name

    Returns:
        pd.DataFrame: cleaned data
    """

    # save dynamic covariates for later
    dynamic_cov_df = dataframe[dynamic_cov].reset_index().drop_duplicates()

    # create target and unique id columns
    dataframe = (
        dataframe.loc[:, ~dataframe.columns.isin(dynamic_cov)]
        .melt(ignore_index=False)
        .reset_index()
        .rename(columns={"variable": "unique_id", "value": target})
    )

    # create dynamic start dates for each series
    cleaned_df = []
    for i in dataframe["unique_id"].unique():
        temp = dataframe[dataframe["unique_id"] == i]
        cleaned_df.append(temp[temp[time_col] >= min(temp[temp[target] > 0][time_col])])
    cleaned_df = pd.concat(cleaned_df)

    # join dynamic covariates
    cleaned_df = pd.merge(cleaned_df, dynamic_cov_df, on=[time_col], how="left")

    return cleaned_df
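
# %%
# Editor's sketch, not part of the original commit: a toy call showing what
# `preprocess_dataset` does. `_toy` is a hypothetical long frame indexed by date,
# with one column per series plus one dynamic covariate.
_toy = pd.DataFrame(
    {
        "series_a": [0, 0, 5, 7],
        "series_b": [2, 3, 4, 5],
        "CPI": [100.0, 100.5, 101.0, 101.2],
    },
    index=pd.date_range("2020-01-31", periods=4, freq="M"),
).rename_axis("Date")
_cleaned = preprocess_dataset(_toy, dynamic_cov=["CPI"], time_col="Date", target="visits")
# `series_a` is trimmed to start at 2020-03-31 (its first non-zero value);
# `series_b` keeps all four rows. Both carry the merged CPI covariate.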


def moirai_forecast_to_pandas(forecast, test_df: pd.DataFrame, forecast_horizon: int, time_col: str) -> pd.DataFrame:
    """
    Convert a MOIRAI forecast into a pandas dataframe.

    Args:
        forecast: MOIRAI's forecast
        test_df: dataframe with actuals
        forecast_horizon: forecast horizon
        time_col: date column

    Returns:
        pd.DataFrame: forecast in pandas format
    """

    d = {"unique_id": [], time_col: [], "forecast": [], "forecast_lower": [], "forecast_upper": []}

    for ts in forecast:
        for j in range(forecast_horizon):
            d["unique_id"].append(ts.item_id)
            d[time_col].append(test_df[test_df["unique_id"] == ts.item_id][time_col].tolist()[j])

            # all sampled values for horizon step j
            temp = [sample[j] for sample in ts.samples]

            d["forecast"].append(np.median(temp))
            d["forecast_lower"].append(np.percentile(temp, 10))
            d["forecast_upper"].append(np.percentile(temp, 90))

    return pd.DataFrame(d)


# load data and exogenous features
df = pd.DataFrame(load_dataset("zaai-ai/time_series_datasets", data_files={"train": "data.csv"})["train"]).drop(
    columns=["Unnamed: 0"]
)
df[TIME_COL] = pd.to_datetime(df[TIME_COL])

# one hot encode month
df["month"] = df[TIME_COL].dt.month
df = pd.get_dummies(df, columns=["month"], dtype=int)

print(f"Distinct number of time series: {len(df['unique_id'].unique())}")
df.head()

# 8 months to test
train = df[df[TIME_COL] <= (max(df[TIME_COL]) - pd.DateOffset(months=FORECAST_HORIZON))]
test = df[df[TIME_COL] > (max(df[TIME_COL]) - pd.DateOffset(months=FORECAST_HORIZON))]

print(
    f"Months for training: {len(train[TIME_COL].unique())} from {min(train[TIME_COL]).date()} to {max(train[TIME_COL]).date()}"
)
print(
    f"Months for testing: {len(test[TIME_COL].unique())} from {min(test[TIME_COL]).date()} to {max(test[TIME_COL]).date()}"
)


# create GluonTS dataset from pandas
ds = PandasDataset.from_long_dataframe(
    pd.concat([train, test[["unique_id", TIME_COL] + DYNAMIC_COV + SEAS_COV]]).set_index(
        TIME_COL
    ),  # concatenation with the test set's dynamic covariates
    item_id="unique_id",
    feat_dynamic_real=DYNAMIC_COV + SEAS_COV,
    target=TARGET,
    freq=FREQ,
)


# Prepare the pre-trained model by downloading the model weights from the Hugging Face Hub
model = MoiraiForecast.load_from_checkpoint(
    checkpoint_path=hf_hub_download(repo_id="Salesforce/moirai-R-large", filename="model.ckpt"),
    prediction_length=FORECAST_HORIZON,
    context_length=24,
    patch_size="auto",
    num_samples=100,
    target_dim=1,
    feat_dynamic_real_dim=ds.num_feat_dynamic_real,
    past_feat_dynamic_real_dim=ds.num_past_feat_dynamic_real,
    map_location="cuda:0" if torch.cuda.is_available() else "cpu",
)

predictor = model.create_predictor(batch_size=32)
forecasts = predictor.predict(ds)

# convert forecast into pandas
forecast_df = moirai_forecast_to_pandas(forecasts, test, FORECAST_HORIZON, TIME_COL)
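
# %%
# Editor's sketch, not part of the original commit: one plausible way to score
# the point forecast against the held-out actuals (per-series MAPE), assuming
# `forecast_df` and `test` share the `unique_id` and Date columns built above.
_eval = forecast_df.merge(test[["unique_id", TIME_COL, TARGET]], on=["unique_id", TIME_COL], how="left")
_eval["ape"] = (_eval["forecast"] - _eval[TARGET]).abs() / _eval[TARGET].abs()
print(_eval.groupby("unique_id")["ape"].mean().sort_values().head())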
2 changes: 1 addition & 1 deletion process_improve/__init__.py
@@ -1,3 +1,3 @@
"""(c) Kevin Dunn, 2010-2024. MIT License."""

__version__ = "0.9.70"
__version__ = "0.9.71"
7 changes: 2 additions & 5 deletions process_improve/univariate/metrics.py
@@ -757,11 +757,8 @@ def outlier_detection_multiple(
R_i_idx = R.idxmax()
g = R.max()
# Formula from the R-function for the Grubbs' calculation
-t = np.sqrt((g**2 * N * (2 - N)) / (g**2 * N - (N - 1) ** 2))
-if t <= 0:
-    p_value = 0
-else:
-    p_value = min(N * (1 - t_value_cdf(t, N - 2)), 1)
+s = g**2 * N * (2 - N) / (g**2 * N - (N - 1) ** 2)
+p_value = 0 if s <= 0 else min(N * (1 - t_value_cdf(np.sqrt(s), N - 2)), 1)

extra_out["R_i_idx"].append(R_i_idx)
extra_out["R_i"].append(R.max())
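
Why the change above matters: for the data this commit's new test exercises, the radicand in the Grubbs formula goes negative. `np.sqrt` of a negative float returns NaN, and because every comparison against NaN is False, the old `if t <= 0` guard never fired and the NaN propagated into the p-value. A minimal sketch of the failure mode, with an illustrative radicand (not a value from the commit):

import numpy as np

s = -0.5  # stands in for g**2 * N * (2 - N) / (g**2 * N - (N - 1) ** 2) going negative
t = np.sqrt(s)  # nan, with a RuntimeWarning
print(t <= 0)  # False: NaN fails every comparison, so the old guard was skipped
p_value = 0 if s <= 0 else None  # the fix: test the radicand before taking the square root

Checking `s` before the square root yields a well-defined p-value of 0 in this regime, which is exactly what the new test below asserts.
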
30 changes: 30 additions & 0 deletions tests/test_univariate.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import numpy as np
import pandas as pd
import pytest
@@ -545,6 +547,34 @@ def test_within_between_sd_missing_values():
assert out["between_dof"] == 0


@pytest.fixture()
def outliers_data_measurement() -> list[float]:
    """From an actual use-case."""
    return [
        10769.166,
        10043.447,
        10171.783,
        10751.362,
        10684.675,
        10772.250,
        10830.804,
    ]


def test_edge_case_outliers(outliers_data_measurement) -> None:
    """Test an actual edge case that did not return p-values."""
    max_outliers = len(outliers_data_measurement) - 5
    outliers, reasons = univariate.outlier_detection_multiple(
        outliers_data_measurement, algoritm="esd", max_outliers_detected=max_outliers, alpha=0.05
    )

    assert len(outliers) == 1
    assert reasons["lambda"][0] == pytest.approx(2.0199685076)
    assert reasons["R_i"][0] == pytest.approx(7.16003736)
    assert reasons["p-value"][0] == 0
    assert reasons["p-value"][1] == 0


@pytest.fixture()
def outliers_data():
    # Rosner data set: https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
Expand Down
