Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs(docstring-readme): finish docstring and write readme #16

Merged
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
docs(stats-pot): create docstring for all pot method related functions
ninopleno committed Dec 4, 2023
commit 7078ea0ec9640a356f9e197712b7ca3a79c58d4e
148 changes: 121 additions & 27 deletions src/anomalytics/stats/peaks_over_threshold.py
Original file line number Diff line number Diff line change
@@ -15,24 +15,47 @@ def get_threshold_peaks_over_threshold(
q: float = 0.90,
) -> pd.Series:
"""
Calculate the POT threshold value that will be used to extract the exceedances from `ts` dataset.
Calculate the Peaks Over Threshold (POT) threshold values for a given time series.

## Parameters
-------------
ts : pandas.Series
The dataset with 1 feature and datetime index to calculate the quantiles.
One feature dataset and a datetime index to calculate the quantiles.

t0 : int
Time window to find dynamic expanding period for calculating quantile score.
Time window to find a dynamic expanding period for calculating the quantile score.

q : float
The quantile to use for thresholding, default 0.90.
anomaly_type : typing.Literal["high", "low"], default is "high"
Type of anomaly to detect - high or low.

q : float, default is 0.90
The quantile used for thresholding.

## Returns
----------
pot_thresholds : pandas.Series:
pd.Series
A Pandas Series where each value is a threshold to extract the exceedances from the original dataset.

## Example
----------
>>> pot_threshold_df = pot_detecto.compute_exceedance_threshold(df, "high", 0.95)
>>> pot_threshold_df.tail()
Date-Time
2020-03-31 19:00:00 0.867
2020-03-31 20:00:00 0.867
2020-03-31 21:00:00 0.867
2020-03-31 22:00:00 0.867
2020-03-31 23:00:00 0.867
Name: Example Dataset, dtype: float64

## Raises
---------
ValueError
If the `anomaly_type` argument is not 'high' or 'low'.
TypeError
If the `ts` argument is not a Pandas Series.
"""

logger.debug(
f"calculating dynamic threshold for exceedance extraction using anomaly_type={anomaly_type}, t0={t0}, q={q}"
)
@@ -58,24 +81,47 @@ def get_exceedance_peaks_over_threshold(
q: float = 0.90,
) -> pd.Series:
"""
Extract values from the `ts` dataset that exceed the POT threshold values.
Extract values from the time series dataset that exceed the POT threshold values.

## Parameters
-------------
ts : pandas.Series
The dataset with 1 feature and datetime index to calculate the quantiles.
The dataset with one feature and a datetime index.

t0 : int
Time window to find dynamic expanding period for calculating quantile score.
Time window to find a dynamic expanding period for calculating the quantile score.

anomaly_type : typing.Literal["high", "low"], default is "high"
Type of anomaly to detect - high or low.

q : float
The quantile to use for thresholding, default 0.90.
q : float, default is 0.90
The quantile used for thresholding.

## Returns
----------
exceedances : pandas.Series
pd.Series
A Pandas Series with values exceeding the POT thresholds.

## Example
----------
>>> exceedance_df = pot_detecto.extract_exceedance(df, "high", pot_threshold_df)
>>> exceedance_df.tail()
Date-Time
2020-03-31 19:00:00 0.867
2020-03-31 20:00:00 0.867
2020-03-31 21:00:00 0.867
2020-03-31 22:00:00 0.867
2020-03-31 23:00:00 0.867
Name: Example Dataset, dtype: float64

## Raises
---------
ValueError
If the `anomaly_type` argument is not 'high' or 'low'.
TypeError
If the `ts` argument is not a Pandas Series.
"""

logger.debug(f"extracting exceedances from dynamic threshold using anomaly_type={anomaly_type}, t0={t0}, q={q}")

if anomaly_type not in ["high", "low"]:
@@ -99,7 +145,7 @@ def get_exceedance_peaks_over_threshold(

def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Series:
"""
Fit exceedances into generalized pareto distribution to calculate the anomaly score.
Calculate the anomaly score for each data point in a time series based on the Generalized Pareto Distribution (GPD).

Anomaly Score = 1 / (1 - CDF(exceedance, c, loc, scale))

@@ -109,16 +155,34 @@ def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Ser
The Pandas Series that contains the exceedances.

t0 : int
Time window to get the first day of t1 time window for dynamic window fitting.
Time window to get the first day of the T1 time window for dynamic window fitting.

gpd_params : dictionary
gpd_params : dict
A dictionary used as the storage of the GPD parameters (fitting result).

## Returns
----------
anomaly_scores : pandas.Series
pd.Series
A Pandas Series with anomaly scores (inverted p-value) as its values.

## Example
----------
>>> anomaly_score_df = pot_detecto.extract_exceedance(df, "high", pot_threshold_df)
>>> anomaly_score_df.head()
Date-Time
2016-10-29 00:00:00 0.0
2016-10-29 01:00:00 0.0
2016-10-29 02:00:00 0.0
2016-10-29 03:00:00 0.0
2016-10-29 04:00:00 0.0
Name: Example Dataset, dtype: float64

## Raises
---------
TypeError
If the `ts` argument is not a Pandas Series.
"""

logger.debug(
f"calculating anomaly score using t0={t0}, scipy.stats.genpareto.fit(), and scipy.stats.genpareto.sf()"
)
@@ -177,7 +241,7 @@ def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Ser

def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:
"""
Claculate a threshold with quantile method used for the comparison to get the anomalies.
Calculate a dynamic threshold based on quantiles used for comparing anomaly scores.

## Parameters
-------------
@@ -187,14 +251,26 @@ def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:
t1 : int
Time window to calculate the quantile score of all anomaly scores.

q : float
The quantile to use for thresholding, default 0.90.
q : float, default is 0.90
The quantile used for thresholding.

## Returns
----------
anomaly_threshold : float
A single float serves as the threshold for anomalous data.
float
A single float value serving as the threshold for anomalous data.

## Example
----------
>>> anomaly_threshold = pot_detecto.compute_anomaly_threshold(anomaly_score_df, 0.90)
>>> print(anomaly_threshold)
9.167442809714414

## Raises
---------
TypeError
If the `ts` argument is not a Pandas Series.
"""

logger.debug(f"calculating anomaly threshold using t1={t1}, q={q}, and `numpy.quantile()` function")

if not isinstance(ts, pd.Series):
@@ -212,24 +288,42 @@ def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:

def get_anomaly(ts: pd.Series, t1: int, q: float = 0.90) -> pd.Series:
"""
Detect the anomaloous data by comparing anoamly scores with anomaly threshold.
Detect anomalous data points by comparing anomaly scores with the anomaly threshold.

## Parameters
-------------
ts : pandas.Series
The Pandas Series that contains the anomaly scores.

t1 : int
Time window to calculate anomaly threshold and retrieve t2 anomaly scores.
Time window to calculate the anomaly threshold and retrieve T2 anomaly scores.

q : float
The quantile to use for thresholding, default 0.90.
q : float, default is 0.90
The quantile used for thresholding.

## Returns
----------
anomalies : pandas.Series
A Pandas Series that reveals which value is anomalous.
pd.Series
A Pandas Series indicating which values are anomalous.

## Example
----------
>>> anomaly_df = pot_detecto.detect(anomaly_score_df, anomaly_threshold_df)
>>> anomaly_df.head()
Date-Time
2019-02-09 08:00:00 False
2019-02-09 09:00:00 False
2019-02-09 10:00:00 False
2019-02-09 11:00:00 False
2019-02-09 12:00:00 False
Name: Example Dataset, dtype: bool

## Raises
---------
TypeError
If the `ts` argument is not a Pandas Series.
"""

logger.debug(f"detecting anomaly using t1={t1}, q={q}, and `get_anoamly_threshold()` function")

if not isinstance(ts, pd.Series):
2 changes: 1 addition & 1 deletion src/anomalytics/time_series/upload.py
Original file line number Diff line number Diff line change
@@ -187,7 +187,7 @@ def read_ts(
Name: Example Dataset, dtype: float64

## Raises
-------
---------
ValueError
If the `path_to_file` is None or if the `file_type` is not one of the supported formats ('csv', 'xlsx').
"""
2 changes: 1 addition & 1 deletion src/anomalytics/time_windows/pot_window.py
Original file line number Diff line number Diff line change
@@ -40,7 +40,7 @@ def compute_pot_windows(
(650, 250, 100)

## Raises
-------
---------
ValueError
If the percentages do not sum up correctly or if t0_pct is less than t1_pct and t2_pct.
"""
2 changes: 1 addition & 1 deletion src/anomalytics/time_windows/time_window.py
Original file line number Diff line number Diff line change
@@ -185,7 +185,7 @@ def set_time_window( # type: ignore
(700, 200, 100)

## Raises
-------
---------
NotImplementedError
If the chosen method's specific time window setting isn't implemented yet.
ValueError