From 7a207a206a8e82d6edfe6ad2521bd990a78d6a4b Mon Sep 17 00:00:00 2001
From: Zack
Date: Tue, 5 Nov 2024 13:33:52 -0500
Subject: [PATCH] migrate rp calcs/exploratory implementation code

---
 exploration/03_check_return_period_funcs.py | 241 ++++++++++++++++++++
 requirements.txt                            |   3 +
 src/utils/return_periods.py                 | 128 +++++++++++
 3 files changed, 372 insertions(+)
 create mode 100644 exploration/03_check_return_period_funcs.py
 create mode 100644 src/utils/return_periods.py

diff --git a/exploration/03_check_return_period_funcs.py b/exploration/03_check_return_period_funcs.py
new file mode 100644
index 0000000..ac3d75d
--- /dev/null
+++ b/exploration/03_check_return_period_funcs.py
@@ -0,0 +1,241 @@
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.11.2
#   kernelspec:
#     display_name: ds-raster-stats
#     language: python
#     name: ds-raster-stats
# ---

# %% [markdown]
# This script exercises the `src.utils.return_periods` functions on some test
# data we have already created for Somalia in the hdx-ingest repo. It is
# basically just a set of sanity checks to make sure the functions run and
# that the results look qualitatively as expected.

# %%
# %matplotlib inline
# %load_ext autoreload
# %autoreload 2

import re
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

from src.utils import cloud_utils
from src.utils import return_periods as rp


# %%
def to_snake_case(name):
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


sample_admin = "Awdal"  # only used for plotting/spot checks

# %% [markdown]
# ## Load Data

# %%
pc = cloud_utils.get_container_client(mode="dev", container_name="projects")
blob_name = "ds-floodscan-ingest/df_aer_sfed_som_adm1_zstats.parquet"
blob_client = pc.get_blob_client(blob_name)
blob_data = blob_client.download_blob().readall()
df = pd.read_parquet(BytesIO(blob_data))
df.columns = [to_snake_case(col) for col in df.columns]

# %% [markdown]
# ## Pre-Process Data
#
# Just take the yearly max values.

# %%
df["year"] = pd.to_datetime(df["date"]).dt.year
df_max = df.groupby(["adm1_en", "adm1_pcode", "year"]).max().reset_index()

# %% [markdown]
# ## Calculate LP3 Params
#
# Show the implementation by first running each estimation method one-by-one
# on a single sample admin.

# %%
rp.lp3_params(
    df_max[df_max["adm1_en"] == sample_admin]["value"], est_method="usgs"
)

# %%
rp.lp3_params(
    df_max[df_max["adm1_en"] == sample_admin]["value"], est_method="lmoments"
)

# %%
rp.lp3_params(
    df_max[df_max["adm1_en"] == sample_admin]["value"], est_method="scipy"
)

# %% [markdown]
# We can also run them all at once with this wrapper.

# %%
rp.lp3_params_all(df_max[df_max["adm1_en"] == sample_admin]["value"])

# %% [markdown]
# Use the wrapper function with `groupby` to run for each admin.

# %%
df_params = (
    df_max.groupby("adm1_en")["value"]
    .apply(rp.lp3_params_all)
    .reset_index()
    .rename(columns={"level_1": "method"})
)
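# %% [markdown]
# Quick structural sanity check (just an illustrative look, nothing is
# computed here): `lp3_params_all` returns a `pd.Series` keyed by method, so
# after the `groupby().apply()` above each row of `df_params` holds one
# method's parameter object in the `value` column. Printing the rows for the
# sample admin lets us eyeball the three parameterisations side by side.

# %%
for _, row in df_params[df_params["adm1_en"] == sample_admin].iterrows():
    print(row["method"], "->", row["value"])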
# %% [markdown]
# ## Calculate RPs
#
# Now we can loop through each admin and calculate the RPs associated with
# each value.

# %%
unique_adm1s = df_params["adm1_en"].unique()
df_list = []

for adm in unique_adm1s:
    df_params_adm_filt = df_params[df_params["adm1_en"] == adm]
    # copy so new RP columns can be added without a SettingWithCopyWarning
    df_adm_filt = df[df["adm1_en"] == adm].copy()

    est_methods = ["lmoments", "scipy", "usgs"]

    for method in est_methods:
        df_adm_filt[f"rp_{method}"] = rp.lp3_rp(
            x=df_adm_filt["value"],
            params=df_params_adm_filt.loc[
                df_params_adm_filt["method"] == method, "value"
            ].iloc[0],
            est_method=method,
        )

    df_list.append(df_adm_filt)

# Concatenate all the dataframes in the list
df_combined = pd.concat(df_list, ignore_index=True)


# %% [markdown]
# Plot the RPs from each method - we can see that they are generally similar,
# but the plot is not very useful.

# %%
df_combined_sample = df_combined[df_combined["adm1_en"] == sample_admin]

# Plot RP values against 'value' for the different estimation methods
plt.figure(figsize=(10, 6))

# Plot for lmoments
plt.scatter(
    df_combined_sample["value"],
    df_combined_sample["rp_lmoments"],
    color="blue",
    label="RP Lmoments",
    alpha=0.5,
)

# Plot for scipy
plt.scatter(
    df_combined_sample["value"],
    df_combined_sample["rp_scipy"],
    color="green",
    label="RP Scipy",
    alpha=0.5,
)

# Plot for usgs
plt.scatter(
    df_combined_sample["value"],
    df_combined_sample["rp_usgs"],
    color="red",
    label="RP USGS",
    alpha=0.5,
)

plt.xlabel("Value")
plt.ylabel("Return Period (RP)")
plt.title("Return Periods (RP) vs Value")
plt.legend()
plt.xscale("log")
plt.yscale("log")
plt.grid(True, which="both", ls="--")
plt.show()


# %% [markdown]
# ## Plot RVs for Specified RPs
#
# For a better comparison we can calculate RVs for specified RPs `1-10000`
# for the sample admin.

# %%
return_periods = np.arange(1, 10000)

params_sample_lmom = df_params.loc[
    (df_params["method"] == "lmoments")
    & (df_params["adm1_en"] == sample_admin),
    "value",
].iloc[0]
params_sample_usgs = df_params.loc[
    (df_params["method"] == "usgs") & (df_params["adm1_en"] == sample_admin),
    "value",
].iloc[0]
params_sample_scipy = df_params.loc[
    (df_params["method"] == "scipy") & (df_params["adm1_en"] == sample_admin),
    "value",
].iloc[0]

# Calculate return values for each return period with each method
return_values_lmom = rp.lp3_rv(
    rp=return_periods, params=params_sample_lmom, est_method="lmoments"
)
return_values_usgs = rp.lp3_rv(
    rp=return_periods, params=params_sample_usgs, est_method="usgs"
)
return_values_scipy = rp.lp3_rv(
    rp=return_periods, params=params_sample_scipy, est_method="scipy"
)

# Plot the return periods against the return values
plt.figure(figsize=(10, 6))
plt.plot(
    return_periods, return_values_lmom * 100, label="LMoments", color="blue"
)
plt.plot(return_periods, return_values_usgs * 100, label="USGS", color="red")
plt.plot(
    return_periods, return_values_scipy * 100, label="SCIPY", color="green"
)
plt.xlabel("Return Period")
plt.ylabel("Return Value (%)")
plt.title("Return Values vs Return Periods")
plt.xscale("log")
plt.yscale("log")
plt.legend()
plt.grid(True, which="both", ls="--")
# Format y-axis as percentages
plt.gca().yaxis.set_major_formatter(
    FuncFormatter(lambda y, _: "{:.0f}%".format(y))
)
plt.show()
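# %% [markdown]
# One more sanity check (a sketch using the standard Weibull plotting
# position, not anything from `src.utils`): rank the annual maxima for the
# sample admin and compare the empirical return periods with the fitted LP3
# ones. The two should be broadly consistent for the middle of the
# distribution, with the fitted values extrapolating beyond it.

# %%
df_max_sample = df_max[df_max["adm1_en"] == sample_admin].copy()
n_years = len(df_max_sample)
# Weibull plotting position: empirical P(exceedance) = rank / (n + 1),
# so the empirical return period is (n + 1) / rank
ranks = df_max_sample["value"].rank(ascending=False)
df_max_sample["rp_empirical"] = (n_years + 1) / ranks
df_max_sample["rp_lp3_lmom"] = rp.lp3_rp(
    x=df_max_sample["value"],
    params=params_sample_lmom,
    est_method="lmoments",
)
df_max_sample[["year", "value", "rp_empirical", "rp_lp3_lmom"]].sort_values(
    "value", ascending=False
)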
diff --git a/requirements.txt b/requirements.txt
index 1253904..eb0c5b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ dask==2024.7.0
 scipy==1.14.1
 pre-commit==3.8.0
 netCDF4==1.7.2
+lmoments3==1.0.7
+pyarrow==17.0.0
diff --git a/src/utils/return_periods.py b/src/utils/return_periods.py
new file mode 100644
index 0000000..9d65bdb
--- /dev/null
+++ b/src/utils/return_periods.py
@@ -0,0 +1,128 @@
import numpy as np
import pandas as pd
from lmoments3 import distr, lmom_ratios
from scipy.stats import norm, pearson3, skew


def lp3_params(x, est_method="lmoments"):
    """Fit log-Pearson Type III parameters to a sample of maxima."""
    # copy as float so zeros can be replaced without modifying the caller's
    # data; zeros are set to the smallest non-zero value so log10 is defined
    x = np.array(x, dtype=float)
    x[x == 0] = np.min(x[x != 0])
    x_sorted = np.sort(x)
    x_log = np.log10(x_sorted)

    if est_method == "lmoments":
        lmoments = lmom_ratios(x_log, nmom=4)
        params = distr.pe3.lmom_fit(x_log, lmoments)
    elif est_method == "scipy":
        params = pearson3.fit(x_log, method="MM")
    elif est_method == "usgs":
        params = {
            "mu": np.mean(x_log),
            "sd": np.std(x_log, ddof=1),
            "g": skew(x_log),
        }
    else:
        raise ValueError(
            "Invalid est_method specified. Choose 'usgs', 'lmoments' or 'scipy'."  # noqa: E501
        )
    return params


def lp3_params_all(value):
    """Fit LP3 parameters with all three estimation methods."""
    methods = ["usgs", "lmoments", "scipy"]
    params = {method: lp3_params(value, method) for method in methods}
    return pd.Series(params)


def lp3_rp(x, params, est_method="lmoments"):
    """Return period associated with each value in ``x``."""
    # keep the input order so the result aligns element-wise with ``x``
    x = np.asarray(x)
    x_log = np.log10(x)

    if est_method == "scipy":
        # CDF (non-exceedance probability) of the log-transformed data
        p_lte = pearson3.cdf(x_log, *params)

    elif est_method == "lmoments":
        p_lte = distr.pe3.cdf(x_log, **params)

    elif est_method == "usgs":
        g = params["g"]
        mu = params["mu"]
        stdev = params["sd"]

        # Step 1: frequency factor of each log-transformed value
        k_rp = (x_log - mu) / stdev

        # Step 2: invert the Wilson-Hilferty relation to recover the
        # standard-normal quantile
        q_rp_norm = (g / 6) + ((k_rp * g / 2 + 1) ** (1 / 3) - 1) * 6 / g

        # Step 3: convert the normal quantile to a non-exceedance probability
        p_lte = norm.cdf(q_rp_norm, loc=0, scale=1)

    else:
        raise ValueError(
            "Invalid est_method. Choose 'usgs', 'lmoments' or 'scipy'."
        )

    # Calculate the return periods from the exceedance probabilities
    p_gte = 1 - p_lte
    rp = 1 / p_gte

    return rp

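# Note on the "usgs" branches above and below (explanatory comment only):
# they rely on the Wilson-Hilferty frequency-factor approximation for the
# Pearson Type III distribution,
#
#     K = (2 / g) * ((1 + g * z / 6 - g ** 2 / 36) ** 3 - 1),
#
# where z is the standard-normal quantile and K = (y - mu) / sd is the
# frequency factor of the log-transformed value y. ``lp3_rv`` applies this
# relation directly (z -> K -> y -> 10 ** y), while ``lp3_rp`` inverts it,
#
#     z = g / 6 + ((K * g / 2 + 1) ** (1 / 3) - 1) * 6 / g,
#
# and then maps z to a non-exceedance probability with ``norm.cdf``.
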
def lp3_rv(rp, params, est_method="usgs"):
    """
    Calculate return values for given return periods using the
    log-Pearson Type III distribution.

    Parameters:
        rp (list or array-like): List of return periods.
        params (dict or tuple): Parameters for the distribution.
            - For 'usgs' method, a dictionary with keys 'g' (skewness),
              'mu' (mean), and 'sd' (standard deviation).
            - For 'lmoments' method, a dictionary with parameters for the
              log-Pearson Type III distribution.
            - For 'scipy' method, a tuple with parameters for the
              log-Pearson Type III distribution.
        est_method (str, optional): Method to estimate the return values.
            Options are 'usgs', 'lmoments', or 'scipy'. Default is 'usgs'.

    Returns:
        numpy.ndarray: Return values corresponding to the given return
        periods.

    Raises:
        ValueError: If an invalid estimation method is provided.
    """
    est_method = est_method.lower()
    if est_method not in ["usgs", "lmoments", "scipy"]:
        raise ValueError(
            "Invalid est_method. Choose 'usgs', 'lmoments' or 'scipy'."
        )

    rp = np.asarray(rp, dtype=float)
    rp_exceedance = 1 / rp  # exceedance probability for each return period

    if est_method == "usgs":
        g = params["g"]
        mu = params["mu"]
        stdev = params["sd"]

        # Standard-normal quantiles of the non-exceedance probabilities
        q_rp_norm = norm.ppf(1 - rp_exceedance, loc=0, scale=1)
        # Wilson-Hilferty skewness adjustment (frequency factor)
        k_rp = (2 / g) * (((q_rp_norm - (g / 6)) * (g / 6) + 1) ** 3 - 1)
        # Fitted log10 values for the return periods
        y_fit_rp = mu + k_rp * stdev
        ret = 10 ** y_fit_rp
    elif est_method == "lmoments":
        value_log = distr.pe3.ppf(1 - rp_exceedance, **params)
        ret = 10 ** value_log
    elif est_method == "scipy":
        value_log = pearson3.ppf(1 - rp_exceedance, *params)
        ret = 10 ** value_log

    return ret
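
# Example usage (a sketch; ``annual_max`` is a placeholder for any 1-D array
# or Series of annual maxima, not something defined in this module):
#
#     params = lp3_params(annual_max, est_method="lmoments")
#     rp_per_obs = lp3_rp(annual_max, params, est_method="lmoments")
#     rv_100yr = lp3_rv([100], params, est_method="lmoments")
#
# i.e. fit the LP3 parameters once, then either look up the return period of
# observed values (lp3_rp) or the value associated with a chosen return
# period (lp3_rv).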