docs(stats-pot): create docstring for all pot method related functions

Aeternalis-Ingenium · Aeternalis-Ingenium · Dec 4, 2023 · Dec 4, 2023 · Dec 4, 2023 · Dec 4, 2023
commit 7078ea0ec9640a356f9e197712b7ca3a79c58d4e
@@ -15,24 +15,47 @@ def get_threshold_peaks_over_threshold(
     q: float = 0.90,
 ) -> pd.Series:
     """
-    Calculate the POT threshold value that will be used to extract the exceedances from `ts` dataset.
+    Calculate the Peaks Over Threshold (POT) threshold values for a given time series.
 
     ## Parameters
     -------------
     ts : pandas.Series
-        The dataset with 1 feature and datetime index to calculate the quantiles.
+        The dataset with one feature and a datetime index, used to calculate the quantiles.
 
     t0 : int
-        Time window to find dynamic expanding period for calculating quantile score.
+        Time window to find a dynamic expanding period for calculating the quantile score.
 
-    q : float
-        The quantile to use for thresholding, default 0.90.
+    anomaly_type : typing.Literal["high", "low"], default is "high"
+        Type of anomaly to detect - high or low.
+
+    q : float, default is 0.90
+        The quantile used for thresholding.
 
     ## Returns
     ----------
-    pot_thresholds : pandas.Series:
+    pd.Series
         A Pandas Series where each value is a threshold to extract the exceedances from the original dataset.
+
+    ## Example
+    ----------
+    >>> pot_threshold_df = pot_detecto.compute_exceedance_threshold(df, "high", 0.95)
+    >>> pot_threshold_df.tail()
+    Date-Time
+    2020-03-31 19:00:00    0.867
+    2020-03-31 20:00:00    0.867
+    2020-03-31 21:00:00    0.867
+    2020-03-31 22:00:00    0.867
+    2020-03-31 23:00:00    0.867
+    Name: Example Dataset, dtype: float64
+
+    ## Raises
+    ---------
+    ValueError
+        If the `anomaly_type` argument is not 'high' or 'low'.
+    TypeError
+        If the `ts` argument is not a Pandas Series.
     """
+
     logger.debug(
         f"calculating dynamic threshold for exceedance extraction using anomaly_type={anomaly_type}, t0={t0}, q={q}"
     )
@@ -58,24 +81,47 @@ def get_exceedance_peaks_over_threshold(
     q: float = 0.90,
 ) -> pd.Series:
     """
-    Extract values from the `ts` dataset that exceed the POT threshold values.
+    Extract values from the time series dataset that exceed the POT threshold values.
 
     ## Parameters
     -------------
     ts : pandas.Series
-        The dataset with 1 feature and datetime index to calculate the quantiles.
+        The dataset with one feature and a datetime index.
 
     t0 : int
-        Time window to find dynamic expanding period for calculating quantile score.
+        Time window to find a dynamic expanding period for calculating the quantile score.
+
+    anomaly_type : typing.Literal["high", "low"], default is "high"
+        Type of anomaly to detect - high or low.
 
-    q : float
-        The quantile to use for thresholding, default 0.90.
+    q : float, default is 0.90
+        The quantile used for thresholding.
 
     ## Returns
     ----------
-    exceedances : pandas.Series
+    pd.Series
         A Pandas Series with values exceeding the POT thresholds.
+
+    ## Example
+    ----------
+    >>> exceedance_df = pot_detecto.extract_exceedance(df, "high", pot_threshold_df)
+    >>> exceedance_df.tail()
+    Date-Time
+    2020-03-31 19:00:00    0.867
+    2020-03-31 20:00:00    0.867
+    2020-03-31 21:00:00    0.867
+    2020-03-31 22:00:00    0.867
+    2020-03-31 23:00:00    0.867
+    Name: Example Dataset, dtype: float64
+
+    ## Raises
+    ---------
+    ValueError
+        If the `anomaly_type` argument is not 'high' or 'low'.
+    TypeError
+        If the `ts` argument is not a Pandas Series.
     """
+
     logger.debug(f"extracting exceedances from dynamic threshold using anomaly_type={anomaly_type}, t0={t0}, q={q}")
 
     if anomaly_type not in ["high", "low"]:
@@ -99,7 +145,7 @@ def get_exceedance_peaks_over_threshold(
 
 def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Series:
     """
-    Fit exceedances into generalized pareto distribution to calculate the anomaly score.
+    Calculate the anomaly score for each data point in a time series based on the Generalized Pareto Distribution (GPD).
 
     Anomaly Score = 1 / (1 - CDF(exceedance, c, loc, scale))
 
@@ -109,16 +155,34 @@ def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Ser
         The Pandas Series that contains the exceedances.
 
     t0 : int
-        Time window to get the first day of t1 time window for dynamic window fitting.
+        Time window to get the first day of the T1 time window for dynamic window fitting.
 
-    gpd_params : dictionary
+    gpd_params : dict
         A dictionary used as the storage of the GPD parameters (fitting result).
 
     ## Returns
     ----------
-    anomaly_scores : pandas.Series
+    pd.Series
         A Pandas Series with anomaly scores (inverted p-value) as its values.
+
+    ## Example
+    ----------
+    >>> anomaly_score_df = pot_detecto.extract_exceedance(df, "high", pot_threshold_df)
+    >>> anomaly_score_df.head()
+    Date-Time
+    2016-10-29 00:00:00    0.0
+    2016-10-29 01:00:00    0.0
+    2016-10-29 02:00:00    0.0
+    2016-10-29 03:00:00    0.0
+    2016-10-29 04:00:00    0.0
+    Name: Example Dataset, dtype: float64
+
+    ## Raises
+    ---------
+    TypeError
+        If the `ts` argument is not a Pandas Series.
     """
+
     logger.debug(
         f"calculating anomaly score using t0={t0}, scipy.stats.genpareto.fit(), and scipy.stats.genpareto.sf()"
     )
@@ -177,7 +241,7 @@ def get_anomaly_score(ts: pd.Series, t0: int, gpd_params: typing.Dict) -> pd.Ser
 
 def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:
     """
-    Claculate a threshold with quantile method used for the comparison to get the anomalies.
+    Calculate a dynamic threshold based on quantiles used for comparing anomaly scores.
 
     ## Parameters
     -------------
@@ -187,14 +251,26 @@ def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:
     t1 : int
         Time window to calculate the quantile score of all anomaly scores.
 
-    q : float
-        The quantile to use for thresholding, default 0.90.
+    q : float, default is 0.90
+        The quantile used for thresholding.
 
     ## Returns
     ----------
-    anomaly_threshold : float
-        A single float serves as the threshold for anomalous data.
+    float
+        A single float value serving as the threshold for anomalous data.
+
+    ## Example
+    ----------
+    >>> anomaly_threshold = pot_detecto.compute_anomaly_threshold(anomaly_score_df, 0.90)
+    >>> print(anomaly_threshold)
+    9.167442809714414
+
+    ## Raises
+    ---------
+    TypeError
+        If the `ts` argument is not a Pandas Series.
     """
+
     logger.debug(f"calculating anomaly threshold using t1={t1}, q={q}, and `numpy.quantile()` function")
 
     if not isinstance(ts, pd.Series):
@@ -212,24 +288,42 @@ def get_anomaly_threshold(ts: pd.Series, t1: int, q: float = 0.90) -> float:
 
 def get_anomaly(ts: pd.Series, t1: int, q: float = 0.90) -> pd.Series:
     """
-    Detect the anomaloous data by comparing anoamly scores with anomaly threshold.
+    Detect anomalous data points by comparing anomaly scores with the anomaly threshold.
 
     ## Parameters
     -------------
     ts : pandas.Series
         The Pandas Series that contains the anomaly scores.
 
     t1 : int
-        Time window to calculate anomaly threshold and retrieve t2 anomaly scores.
+        Time window to calculate the anomaly threshold and retrieve T2 anomaly scores.
 
-    q : float
-        The quantile to use for thresholding, default 0.90.
+    q : float, default is 0.90
+        The quantile used for thresholding.
 
     ## Returns
     ----------
-    anomalies : pandas.Series
-        A Pandas Series that reveals which value is anomalous.
+    pd.Series
+        A Pandas Series indicating which values are anomalous.
+
+    ## Example
+    ----------
+    >>> anomaly_df = pot_detecto.detect(anomaly_score_df, anomaly_threshold_df)
+    >>> anomaly_df.head()
+    Date-Time
+    2019-02-09 08:00:00    False
+    2019-02-09 09:00:00    False
+    2019-02-09 10:00:00    False
+    2019-02-09 11:00:00    False
+    2019-02-09 12:00:00    False
+    Name: Example Dataset, dtype: bool
+
+    ## Raises
+    ---------
+    TypeError
+        If the `ts` argument is not a Pandas Series.
     """
+
     logger.debug(f"detecting anomaly using t1={t1}, q={q}, and `get_anoamly_threshold()` function")
 
     if not isinstance(ts, pd.Series):

@@ -187,7 +187,7 @@ def read_ts(
     Name: Example Dataset, dtype: float64
 
     ## Raises
-    -------
+    ---------
     ValueError
         If the `path_to_file` is None or if the `file_type` is not one of the supported formats ('csv', 'xlsx').
     """

@@ -40,7 +40,7 @@ def compute_pot_windows(
     (650, 250, 100)
 
     ## Raises
-    -------
+    ---------
     ValueError
         If the percentages do not sum up correctly or if t0_pct is less than t1_pct and t2_pct.
     """

@@ -185,7 +185,7 @@ def set_time_window(  # type: ignore
     (700, 200, 100)
 
     ## Raises
-    -------
+    ---------
     NotImplementedError
         If the chosen method's specific time window setting isn't implemented yet.
     ValueError