From 5b7e9956b2c71b6a5224223d7251cf58dd973004 Mon Sep 17 00:00:00 2001 From: Pedro Capelastegui Date: Mon, 24 Sep 2018 15:47:03 +0100 Subject: [PATCH] closes #3; (#4) --- CONTRIBUTING.md | 52 + anticipy/__init__.py | 3 + anticipy/app.py | 124 ++ anticipy/forecast.py | 1173 +++++++++++++++ anticipy/forecast_models.py | 1549 +++++++++++++++++++ anticipy/forecast_plot.py | 266 ++++ anticipy/model_utils.py | 291 ++++ anticipy/utils_test.py | 122 ++ setup.py | 6 +- tests/__init__.py | 1 + tests/data/candy_production.csv | 549 +++++++ tests/data/df_test_naive.csv | 79 + tests/data/df_test_naive2.csv | 58 + tests/data/test_normalize.csv | 45 + tests/test_forecast.py | 2513 +++++++++++++++++++++++++++++++ tests/test_forecast_model.py | 900 +++++++++++ tests/test_forecast_plot.py | 157 ++ tests/test_model_utils.py | 192 +++ 18 files changed, 8077 insertions(+), 3 deletions(-) create mode 100644 CONTRIBUTING.md create mode 100644 anticipy/__init__.py create mode 100644 anticipy/app.py create mode 100644 anticipy/forecast.py create mode 100644 anticipy/forecast_models.py create mode 100644 anticipy/forecast_plot.py create mode 100644 anticipy/model_utils.py create mode 100644 anticipy/utils_test.py create mode 100644 tests/__init__.py create mode 100755 tests/data/candy_production.csv create mode 100644 tests/data/df_test_naive.csv create mode 100644 tests/data/df_test_naive2.csv create mode 100644 tests/data/test_normalize.csv create mode 100644 tests/test_forecast.py create mode 100644 tests/test_forecast_model.py create mode 100644 tests/test_forecast_plot.py create mode 100644 tests/test_model_utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..172bf4a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,52 @@ +# Contributing + +Contributions are welcomed! + +When contributing to this repository, please first discuss the change you wish to make via GitHub +issue before making a change. This saves everyone from wasted effort in the event that the proposed +changes need some adjustment before they are ready for submission. + +## Pull Request Process + +1. If your changes include multiple commits, please squash them into a single commit. Stack Overflow + and various blogs can help with this process if you're not already familiar with it. +2. Update the README.md where relevant. +3. You may merge the Pull Request in once you have the sign-off of two other developers, or if you + do not have permission to do that, you may request the second reviewer to merge it for you. + +## Contributor Code of Conduct + +As contributors and maintainers of this project, and in the interest of fostering an open and +welcoming community, we pledge to respect all people who contribute through reporting issues, +posting feature requests, updating documentation, submitting pull requests or patches, and other +activities. + +We are committed to making participation in this project a harassment-free experience for everyone, +regardless of level of experience, gender, gender identity and expression, sexual orientation, +disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery +* Personal attacks +* Trolling or insulting/derogatory comments +* Public or private harassment +* Publishing other's private information, such as physical or electronic addresses, without explicit + permission +* Other unethical or unprofessional conduct. 
+ +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, +code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. By +adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently +applying these principles to every aspect of managing this project. Project maintainers who do not +follow or enforce the Code of Conduct may be permanently removed from the project team. + +This code of conduct applies both within project spaces and in public spaces when an individual is +representing the project or its community. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an +issue or contacting one or more of the project maintainers. + +This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), +version 1.2.0, available at +[http://contributor-covenant.org/version/1/2/0/](http://contributor-covenant.org/version/1/2/0/) \ No newline at end of file diff --git a/anticipy/__init__.py b/anticipy/__init__.py new file mode 100644 index 0000000..b8d1eaf --- /dev/null +++ b/anticipy/__init__.py @@ -0,0 +1,3 @@ +import pkg_resources +__version__ = pkg_resources.require(__name__)[0].version +del pkg_resources diff --git a/anticipy/app.py b/anticipy/app.py new file mode 100644 index 0000000..e8e8723 --- /dev/null +++ b/anticipy/app.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# +# License: This module is released under the terms of the LICENSE file +# contained within this applications INSTALL directory + +""" + __high_level_module_description_here__ +""" + +# -- Coding Conventions +# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide +# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings + +# -- Public Imports +import logging +import pandas as pd +import os +import forecast +import forecast_plot +import argparse + +# -- Private Imports + +# -- Globals +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +# -- Exception classes + + +# -- Functions +def logger_info(msg, data): + # Convenience function for easier log typing + logger.info(msg + '\n%s', data) + + +def run_forecast_app(path_in, path_out=None, forecast_years=2.0, + col_name_y='y', col_name_weight='weight', + col_name_x='x', col_name_date='date', + col_name_source='source', + include_all_fits=False + ): + assert path_in is not None and os.path.exists(path_in), 'path_in needs to be a string pointing to a valid file path' + assert not os.path.isdir(path_in) + + file_name = os.path.basename(path_in) + file_name_p1 = file_name.split('.')[0] + + logger_info('file_name', file_name) + logger_info('file_name p1', file_name_p1) + + if path_out is None: + path_out = path_in + assert os.path.exists(path_out) + + path_folder = os.path.dirname(path_out) + + logger_info('dir name', path_folder) + + path_data = os.path.join(path_folder, file_name_p1+'_fcast.csv') + path_metadata = os.path.join(path_folder, file_name_p1+'_metadata.csv') + path_plot = os.path.join(path_folder, file_name_p1 + '_fcast.png') + + logger_info('path_data', path_data) + logger_info('path_metadata', path_metadata) + logger_info('path_plot', path_plot) + + df_y = pd.read_csv(path_in) + + if col_name_date in df_y: # Need to parse date + df_y[col_name_date] = df_y[col_name_date].pipe(pd.to_datetime) + + df_y = forecast.normalize_df(df_y, col_name_y, col_name_weight, col_name_x, col_name_date, + col_name_source) + + dict_result 
= forecast.run_forecast(df_y, extrapolate_years=forecast_years, simplify_output=False, + include_all_fits=include_all_fits) + + df_result = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + df_result.to_csv(path_data, index=False) + df_metadata.to_csv(path_metadata, index=False) + + try: + forecast_plot.plot_forecast_save(df_result, path_plot, width=1920, height=1080) + except AssertionError: + logger.info("Couldn't generate plot - R not installed") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--path_in', help='Path of input .csv file') + parser.add_argument('--path_out', help='Path of output folder - defaults to folder of path_in', default=None) + parser.add_argument('--forecast_years', help='Years in forecast interval', default=2.0, type=float) + parser.add_argument('--col_name_y', help='Name of column for y', default='y') + parser.add_argument('--col_name_date', help='Name of column for date', default='date') + parser.add_argument('--col_name_weight', help='Name of column for weight', default='weight') + parser.add_argument('--col_name_source', help='Name of column for y', default='source') + parser.add_argument('--col_name_x', help='Name of column for x', default='x') + parser.add_argument('--include_all_fits', help='If true, output includes non-optimal models', action='store_true') + + args = parser.parse_args() + logger.info('Input: path_in= %s', args.path_in) + logger.info('Input: path_out= %s', args.path_out) + logger.info('Input: col_name_y= %s', args.col_name_y) + logger.info('Input: col_name_date= %s', args.col_name_date) + logger.info('Input: col_name_x= %s', args.col_name_x) + logger.info('Input: col_name_weight= %s', args.col_name_weight) + logger.info('Input: col_name_source= %s', args.col_name_source) + logger.info('Input: include_all_fits= %s', args.include_all_fits) + + run_forecast_app(args.path_in, args.path_out, args.forecast_years, + args.col_name_y, args.col_name_weight, args.col_name_x, args.col_name_date, + args.col_name_source, args.include_all_fits) + + # run_forecast_app('/Users/pec21/Downloads/file1.csv','/Users/pec21/Downloads/', + # col_name_y='occup_erl', col_name_source='bend_name') + + +# -- Main +if __name__ == '__main__': + main() diff --git a/anticipy/forecast.py b/anticipy/forecast.py new file mode 100644 index 0000000..84a5331 --- /dev/null +++ b/anticipy/forecast.py @@ -0,0 +1,1173 @@ +# -*- coding: utf-8 -*- +# +# License: This module is released under the terms of the LICENSE file +# contained within this applications INSTALL directory + +""" +Functions to run forecast +""" + +# -- Coding Conventions +# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide +# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings + +# -- Public Imports +import logging +import numpy as np +import pandas as pd +import scipy +from scipy import optimize +import itertools + + +# -- Private Imports +from anticipy import forecast_models, model_utils + +# -- Globals +from anticipy.model_utils import detect_freq + +logger = logging.getLogger(__name__) + + +# -- Exception classes + +# -- Functions +def logger_info(msg, data): + # Convenience function for easier log typing + logger.info(msg + '\n%s', data) + + +# Utility functions +def to_str_function_list(l_function): + if l_function is None: + return None + return [f.name if f is not None else None for f in l_function] + +def is_null_model(f_model): + return f_model.name == 'null' + + +def 
_is_multi_ts(a): + return a.ndim > 1 and a.shape[1] > 1 + + +def _has_different_res_weights(res_weights): + # Check if a residuals parameter is a vector with length > 1 + return res_weights is not None and hasattr(res_weights, "__getitem__") and len(res_weights) > 1 + + +# TODO: replace get_residuals with this +def get_residuals(params, model, a_x, a_y, a_date, a_weights=None, filter_null_residuals=None, df_actuals=None): + """ + Given a time series, a model function and a set of parameters, get the residuals + + :param params: parameters for model function + :type params: numpy array + :param model: model function. Usage: model(a_x, a_date, params) + :type model: function + :param a_x: X axis for model function. + :type a_x: float array + :param a_y: Input time series values, to compare to the model function + :type a_y: float array + :param a_date: Dates for the input time series + :type a_date: datetime array + :param a_weights: weights for each individual sample + :type a_weights: numpy array + + :return: array with residuals, same length as a_x, a_y + :rtype: float array + """ + # Note: remove this assert for performance + assert a_y.ndim == 1 + # Note: none of the input arrays should include NaN values + # We do not check this with asserts due to performance - this function is in the optimization loop + + y_predicted = model(a_x, a_date, params, df_actuals=df_actuals) + residuals = (a_y - y_predicted) + if a_weights is not None: # Do this only if different residual weights + residuals = residuals * a_weights + result = np.abs(residuals) + return result + + +def optimize_least_squares(model, a_x, a_y, a_date, a_weights=None, f_t_scaling=None, df_actuals=None): + """ + Given a time series and a model function, find the set of parameters that minimises residuals + + :param model: model function, to be fitted against the actuals + :type model: function + :param a_x: + :type a_x: float array + :param a_y: + :type a_y: float array + :param a_date: + :type a_date: datetime array + :param res_weights: + :type res_weights: + :param use_t_scaling: + :type use_t_scaling: + :param bounds: + :type bounds: 2-tuple of array_like + :return: + | table(success, params, cost, optimality, iterations, status, jac_evals, message): + | - success (bool): True if successful fit + | - params (list): Parameters of fitted model + | - cost (float): Value of cost function + | - optimality(float) + | - iterations (int) : Number of function evaluations + | - status (int) : Status code + | - jac_evals(int) : Number of Jacobian evaluations + | - message (str) : Output message + :rtype: pandas.DataFrame + """ + assert a_y.ndim == 1 + + # Check that input is sorted - not required - taken care by normalize_df() + # assert np.all(np.diff(a_x) >= 0), 'Input not sorted on x axis' + + # Ask the model to provide an initial guess + initial_guess = model.f_init_params(a_x, a_y) + + bounds = model.f_bounds(a_x, a_y) + + assert forecast_models.validate_initial_guess(initial_guess, bounds), \ + 'Initial guess outside of bounds: {} - {}, {}'.format(model, initial_guess, bounds) + + # In multi-ts scenarios, we apply this filter to ignore residuals for null y_values + filter_null_residuals = ~np.isnan(a_y) + if np.all(filter_null_residuals): + filter_null_residuals = None + + # t_scaling: we use this to assign different weight to residuals based on date # TODO: Implement scaling functions + if f_t_scaling: + a_weights_tmp = f_t_scaling(a_x) + a_weights = a_weights_tmp if a_weights is None else a_weights*a_weights_tmp + + # Set up 
arguments for get_residuals + f_model_args = (model, a_x, a_y, a_date) + + result = scipy.optimize.least_squares(get_residuals, initial_guess, + args=f_model_args, + kwargs={'a_weights': a_weights, 'df_actuals':df_actuals}, + # method='lm', + method='trf', + x_scale='jac', + # verbose=1, + bounds=bounds + ) + dict_result_df = { + 'optimality': result['optimality'], + 'success': result['success'], + 'cost': result['cost'], + 'iterations': result['nfev'], + 'jac_evals': result['njev'], + 'status': result['status'], + 'message': result['message'], + 'params': [result['x']] + } + df_result = pd.DataFrame(data=dict_result_df, index=pd.Index([0])) + df_result = df_result[['success', 'params', 'cost', 'optimality', 'iterations', 'status', 'jac_evals', 'message']] + return df_result + + +def _get_df_fit_model(source, model, weights, actuals_x_range, freq, + is_fit, cost, aic_c, params, status): + if params is None: + params = np.array([]) + df_result = ( + pd.DataFrame(columns=['source', 'model', 'weights', 'actuals_x_range', 'freq', + 'is_fit', 'cost', 'aic_c', 'params_str', 'status', 'source_long', 'params'], + data=[[source, model, weights, actuals_x_range, freq, + is_fit, cost, aic_c, np.array_str(params, precision=1), status, + '{}:{}:{}:{}'.format(source, weights, freq, actuals_x_range), + params + ]]) + ) + return df_result + + +def _get_empty_df_result_optimize(source, model, status, weights, freq, actuals_x_range): + source_long = '{}:{}:{}:{}'.format(source, weights, freq, actuals_x_range) + return pd.DataFrame(columns=['source', 'model', 'success', 'params_str', 'cost', 'optimality', 'iterations', + 'status', 'jac_evals', 'message', 'source_long', 'params'], + data=[[source, model, False, '[]', np.NaN, np.NaN, np.NaN, status, np.NaN, status, + source_long, []]]) + + +def normalize_df(df_y, + col_name_y='y', + col_name_weight='weight', + col_name_x='x', + col_name_date='date', + col_name_source='source'): + """ + Converts an input dataframe for run_forecast() into a normalized format suitable for fit_model() + + :param df_y: + :type df_y: pandas.DataFrame + :param col_name_y: + :type col_name_y: str + :param col_name_weight: + :type col_name_weight: str + :param col_name_x: + :type col_name_x: str + :param col_name_date: + :type col_name_date: str + """ + + assert df_y is not None + if df_y.empty: + return None + + if isinstance(df_y, pd.Series): + df_y = df_y.to_frame() + assert isinstance(df_y, pd.DataFrame) + assert col_name_y in df_y.columns, 'Dataframe needs to have a column named "{}"'.format(col_name_y) + df_y = df_y.copy() + + # Rename columns to normalized values + rename_col_dict = { + col_name_y:'y', + col_name_weight:'weight', + col_name_x:'x', + col_name_date:'date', + col_name_source:'source' + } + df_y = df_y.copy().rename(rename_col_dict, axis=1) + + # Placeholder - need to replace all references to col_name_z with z + col_name_y = 'y' + col_name_weight = 'weight' + col_name_x = 'x' + col_name_date = 'date' + col_name_source = 'source' + + # Ensure y column is float + df_y[col_name_y] = df_y[col_name_y].astype(float) + + multiple_sources = col_name_source in df_y.columns + l_sources = df_y[col_name_source].drop_duplicates() if multiple_sources else ['test_source'] + + l_df_results = [] + for source in l_sources: + df_y_tmp = df_y.loc[df_y[col_name_source] == source].copy() if multiple_sources else df_y + # Setup date, x columns + if col_name_date not in df_y.columns and isinstance(df_y.index, pd.DatetimeIndex): # use index as i_date + df_y_tmp[col_name_date] = 
df_y_tmp.index + elif col_name_date in df_y.columns: # Ensure that date column is timestamp dtype + df_y_tmp[col_name_date] = df_y_tmp[col_name_date].pipe(pd.to_datetime) + + #if isinstance(df_y_tmp.index, pd.DatetimeIndex): + # We don't need a date index after this point + df_y_tmp = df_y_tmp.reset_index(drop=True) + + if col_name_x not in df_y_tmp.columns: + if col_name_date in df_y_tmp.columns: + # Need to extract numeric index from a_date + df_date_interp = ( + df_y_tmp[[col_name_date]].drop_duplicates().pipe(model_utils.interpolate_df).rename_axis(col_name_x).reset_index() + ) + df_y_tmp = ( + df_date_interp.merge(df_y_tmp) + ) + else: # With no date, extract column x from a numeric index + df_y_tmp[col_name_x] = df_y_tmp.index + + l_df_results += [df_y_tmp] + + # Rename columns to normalized values + rename_col_dict = { + col_name_y: 'y', + col_name_weight: 'weight', + col_name_x: 'x', + col_name_date: 'date', + col_name_source: 'source' + } + + df_result = pd.concat(l_df_results, sort=False, ignore_index=True) #.rename(rename_col_dict, axis=1) + + # Sort columns, filter unused columns + df_result = df_result[[c for c in ['date', 'source', 'x', 'y', 'weight'] if c in df_result.columns]] + sort_columns = ['source','x'] if 'source' in df_result.columns else ['x'] + df_result = df_result.sort_values(sort_columns).reset_index(drop=True) + return df_result + + +def fit_model(model, df_y, freq='W', source='test', df_actuals=None): + """ + Given a time series and a model, optimize model parameters and return + + :param model: + :type model: function + :param df_y: + | Dataframe with the following columns: + | - y: + | - date: (optional) + | - weight: (optional) + | - x: (optional) + :type df_y: pandas.DataFrame + :param source: + :type source: + :param freq: 'W' or 'D' . Used only for metadata + :type freq: str + :return: table (source, model_name, y_weights , freq, is_fit, aic_c, params) + :rtype: pandas.DataFrame + + This function calls optimize_least_squares() to perform the optimization loop. It performs some cleaning up of input + and output parameters. + """ + col_name_y = 'y' + col_name_weight = 'weight' + col_name_x = 'x' + col_name_date = 'date' + + assert df_y is not None and isinstance(df_y, pd.DataFrame) and col_name_y in df_y.columns + + # Setup + f_model_name = model.name + n_params = model.n_params + + df_y = df_y.copy() + # Filter out any sample where df_y is null + df_y = df_y.loc[~df_y[col_name_y].pipe(pd.isna)] + + # Filter out any sample where a_weights is 0 + if col_name_weight in df_y.columns: + df_y[col_name_weight] = df_y[col_name_weight].fillna(0) + df_y = df_y.loc[df_y[col_name_weight] >= 0] + + # Metadata + if col_name_weight not in df_y.columns: + weights = '1' + else: + weights = '{}-{}'.format(df_y[col_name_weight].min(), df_y[col_name_weight].max()) + + # Residual normalization + if df_y[col_name_x].duplicated().any(): + df_k = df_y.groupby(col_name_x).size().rename('k_weight_normalize').reset_index() + df_y = df_y.merge(df_k) + if col_name_weight not in df_y: + df_y[col_name_weight] = 1.0 + # Adjust residual weight based on number of values per sample + # E.g. 
a sample with 2 values in the input series will multiply residuals by 0.5 + df_y[col_name_weight] = df_y[col_name_weight]/df_y['k_weight_normalize'] + + # Get input arrays + a_y = df_y[col_name_y].values + a_x = model_utils.apply_a_x_scaling(df_y[col_name_x].values, model) + a_weights = df_y[col_name_weight].values if col_name_weight in df_y.columns else None + # Need to convert series to DatetimeIndex + i_date = pd.DatetimeIndex(df_y[col_name_date]) if col_name_date in df_y.columns else None + + # Metadata + cost = np.NaN + is_fit = False + params = [] + # Get first and last actuals date, for metadata. If no a_date, use a_x instead. + date_start_actuals = i_date.min().date() if i_date is not None else a_x.min() + date_end_actuals = i_date.max().date() if i_date is not None else a_x.max() + actuals_x_range = '{}::{}'.format(date_start_actuals, date_end_actuals) + + if df_y.empty: + logger.info('Cannot fit - empty df_y: %s', source) + status = 'EMPTY_TS' + df_result = _get_df_fit_model(source, model.name, weights, actuals_x_range, freq, + is_fit, cost, np.NaN, None, status) + df_result_optimize = _get_empty_df_result_optimize(source, model, status, weights, freq, actuals_x_range) + + elif a_x.size < n_params + 2: + logger.info('Not enough samples in source %s for %s: %s (needs %s)', + source, f_model_name, a_x.size, n_params + 2) + status = 'TS_TOO_SHORT' + df_result = _get_df_fit_model(source, model.name, weights, actuals_x_range, freq, + is_fit, cost, np.NaN, None, status) + df_result_optimize = _get_empty_df_result_optimize(source, model, status, weights, freq, actuals_x_range) + else: # Get results + model = forecast_models.simplify_model(model, a_x, a_y, i_date) + + if model.n_params==0: + # 0-parameter model, cannot be fit + #logger.info('Model has 0 parameters - no fitting required') + + a_residuals = get_residuals(None, model, a_x, a_y, i_date, a_weights, df_actuals=df_actuals) + cost = 0.5*np.nansum(a_residuals**2) + is_fit = True + params = np.array([]) + status = 'FIT' + + # Process results + + aic_c = model_utils.get_aic_c(cost, len(df_y), n_params) + + df_result = _get_df_fit_model(source, model.name, weights, actuals_x_range, freq, + is_fit, cost, aic_c, params, status) + + dict_result_df = { + 'optimality': 0., + 'success': True, + 'cost':cost, + 'iterations': 0., + 'jac_evals': 0., + 'status': 0, + 'message': 'Naive model fitted', + 'params': '-' + } + df_result_optimize = pd.DataFrame(data=dict_result_df, index=pd.Index([0])) + df_result_optimize = df_result_optimize[ + ['success', 'params', 'cost', 'optimality', 'iterations', 'status', 'jac_evals', 'message']] + df_result_optimize['source'] = source + df_result_optimize['source_long'] = df_result.source_long.iloc[0] + df_result_optimize['model'] = model + df_result_optimize['params_str'] = df_result.params_str.iloc[0] + df_result_optimize = df_result_optimize[ + ['source', 'model', 'success', 'params_str', 'cost', 'optimality', 'iterations', + 'status', 'jac_evals', 'message', 'source_long', 'params']] + else: + df_result_optimize = optimize_least_squares(model, a_x, a_y, i_date, a_weights, df_actuals=df_actuals) + cost = df_result_optimize.cost.iloc[0] + is_fit = df_result_optimize.success.iloc[0] + params = df_result_optimize.params.iloc[0] + status = 'FIT' if is_fit else 'NO-FIT' + + # Process results + if status in ['FIT','NO-FIT']: + aic_c = model_utils.get_aic_c(cost, len(df_y), n_params) + else: + aic_c = np.NaN + + df_result = _get_df_fit_model(source, model.name, weights, actuals_x_range, freq, + is_fit, cost, 
aic_c, params, status) + + df_result_optimize['source'] = source + df_result_optimize['source_long'] = df_result.source_long.iloc[0] + df_result_optimize['model'] = model + df_result_optimize['params_str'] = df_result.params_str.iloc[0] + df_result_optimize = df_result_optimize [['source','model','success','params_str','cost','optimality','iterations', + 'status','jac_evals','message','source_long','params']] + + dict_result = {'metadata':df_result, 'optimize_info':df_result_optimize} + return dict_result + + +def extrapolate_model(model, params, date_start_actuals, date_end_actuals, freq='W', extrapolate_years=2.0, + x_start_actuals=0., df_actuals=None): + """ + Given a model and a set of parameters, generate model output for a date range plus a number of additional years. + + :param model: + :type model: + :param params: + :type params: + :param date_start_actuals: + :type date_start_actuals: + :param date_end_actuals: + :type date_end_actuals: + :param freq: + :type freq: + :param extrapolate_years: + :type extrapolate_years: + :return: + :rtype: + """ + s_x = model_utils.get_s_x_extrapolate(date_start_actuals, date_end_actuals, model=model, freq=freq, + extrapolate_years=extrapolate_years, x_start_actuals=x_start_actuals) + a_y_forecast = model(s_x.values, s_x.index, params, df_actuals=df_actuals) + s_y_forecast = pd.Series(data=a_y_forecast, index=s_x.index, name='y') + df_y_forecast = pd.DataFrame(s_y_forecast) + return df_y_forecast + + +def get_list_model(l_model_trend, l_model_season, season_add_mult='both'): + if l_model_season is None or len(l_model_season) < 1: + l_model_tmp = l_model_trend + elif l_model_trend is None or len(l_model_trend) < 1: + l_model_tmp = l_model_season + else: + l_model_tmp = [] + if season_add_mult != 'mult': # 'add' or 'both' + l_model_tmp += [model_trend+model_season for model_trend, model_season in + itertools.product(l_model_trend, l_model_season)] + if season_add_mult != 'add': # 'mult' or 'both' + l_model_tmp += [model_trend*model_season for model_trend, model_season in + itertools.product(l_model_trend, l_model_season)] + + l_model_tmp = pd.Series(l_model_tmp).drop_duplicates().tolist() + return l_model_tmp + + +def get_df_actuals_clean(df_actuals, source, source_long): + """ + + :param df_actuals: dataframe in normalized format, with columns y and optionally x, date, weight + :type df_actuals: + :param source: + :type source: + :param source_long: + :type source_long: + :return: + :rtype: + """ + # Add actuals as entries in result dicts + df_actuals = df_actuals.copy() # .rename_axis('date') + if 'date' not in df_actuals.columns: + df_actuals = df_actuals.rename({'x': 'date'}, axis=1) + df_actuals = df_actuals[[c for c in ['date', 'weight', 'y'] if c in df_actuals.columns]] + + df_actuals['model']='actuals' + df_actuals['source'] = source + df_actuals['source_long'] = source_long + df_actuals['is_actuals'] = True + if not 'weight' in df_actuals.columns: + df_actuals['weight'] = 1.0 + return df_actuals + + +def _get_df_fcast_clean(df_fcast, source, source_long,model): + # TODO: cleanup + # This removes any forecast samples with null values, e.g. 
from naive models + df_fcast = df_fcast.loc[ ~df_fcast.y.pipe(pd.isnull)] + df_fcast = df_fcast.copy().rename_axis('date').reset_index() + df_fcast['source'] = source + df_fcast['source_long'] = source_long + df_fcast['model'] = model + df_fcast['is_actuals'] = False + df_fcast['weight'] = 1.0 + return df_fcast + + +""" +# TODO: api improvements: +- change default df format to have columns: x,y, date, weight +- currently, we assume a datetimeindex + +""" + + +def run_forecast_from_input_list(l_dict_input): + # Run forecasts from a list of dictionaries with keyword arguments + + # Handle both scalars and list-likes + s_input = pd.Series(l_dict_input) + + l_dict_result = [] + for dict_input in l_dict_input: + dict_result_tmp = run_forecast(**dict_input) + l_dict_result += [dict_result_tmp] + + # Generate output + return aggregate_forecast_dict_results(l_dict_result) + + +def run_forecast(df_y, l_model_trend=None, l_model_season=None, + date_start_actuals=None, source_id='src', + col_name_y='y', col_name_weight='weight', + col_name_x='x', col_name_date='date', + col_name_source='source', + extrapolate_years=0, season_add_mult='add', + include_all_fits=False, + simplify_output=True, + do_find_steps_and_spikes=False, + find_outliers=False, + l_season_yearly=None, + l_season_weekly=None, + verbose=None, + l_model_naive=None + ): + """ + Generate forecast for one or more input time series + + :return: + :rtype: + :param df_y: + :type df_y: + :param l_model_trend: + :type l_model_trend: + :param l_model_season: + :type l_model_season: + :param date_start_actuals: + :type date_start_actuals: + :param source_id: + :type source_id: + :param col_name_y: + :type col_name_y: + :param col_name_weight: + :type col_name_weight: + :param col_name_x: + :type col_name_x: + :param col_name_date: + :type col_name_date: + :param col_name_source: + :type col_name_source: + :param return_all_models: + | If True, result includes non-fitting models, with null AIC and an empty forecast df. + | Otherwise, result includes only fitting models, and for time series where no fitting model is available, + | a 'no-best-model' entry with null AIC and an empty forecast df is added. + :type return_all_models: bool + :param return_all_fits: If True, result includes all models for each input time series. Otherwise, only the + best model is included. + :type return_all_fits: bool + :param extrapolate_years: + :type extrapolate_years: float + :param season_add_mult: 'add', 'mult', or 'both'. Whether forecast seasonality will be additive, multiplicative, + or the best fit of the two. + :type season_add_mult: str + :param fill_gaps_y_values: If True, gaps in time series will be filled with NaN values + :type fill_gaps_y_values: bool + :param freq: 'W' or 'D' . Sampling frequency of the output forecast: weekly or daily. + :type freq: str + :param do_find_steps_and_spikes: if True, find steps and spikes, create fixed models and add them + to the list of models + :type do_find_steps_and_spikes: bool + :param find_outliers: + :type find_outliers: + :param include_all_fits: + :type include_all_fits: + :param simplify_output: If False, return dict with forecast and metadata. Otherwise, return only forecast. 
+ :type simplify_output: bool + :return: + :rtype: + """ + # TODO: Add check for non-duplicate source ids + l_dict_result = [] + + df_y = normalize_df(df_y, col_name_y, col_name_weight, col_name_x, col_name_date, col_name_source) + if df_y is None: # Empty input + return None + + if 'source' not in df_y.columns: + return run_forecast_single(df_y, + l_model_trend, + l_model_season, + date_start_actuals, + source_id, + extrapolate_years, + season_add_mult, + include_all_fits, + simplify_output, + do_find_steps_and_spikes, + find_outliers, + l_season_yearly, + l_season_weekly, + l_model_naive=l_model_naive + ) + else: + for src_tmp in df_y.source.drop_duplicates(): + if verbose: + logger.info('Running forecast for source: %s', src_tmp) + df_y_tmp = df_y.loc[df_y.source==src_tmp].reset_index(drop=True) + dict_result_tmp = run_forecast_single(df_y_tmp, + l_model_trend, + l_model_season, + date_start_actuals, + src_tmp, + extrapolate_years, + season_add_mult, + include_all_fits, + False, # Simplify output + do_find_steps_and_spikes, + find_outliers, + l_season_yearly, + l_season_weekly, + l_model_naive=l_model_naive + ) + l_dict_result += [dict_result_tmp] + # Generate output + dict_result = aggregate_forecast_dict_results(l_dict_result) + if simplify_output: + return dict_result.get('forecast') + else: + return dict_result + + +def aggregate_forecast_dict_results(l_dict_result): + l_df_data = [] + l_df_metadata = [] + l_df_optimize_info = [] + # Forecast with prediction interval + l_df_forecast = [] + + for dict_result in l_dict_result: + l_df_data += [dict_result['data']] + l_df_metadata += [dict_result['metadata']] + l_df_optimize_info += [dict_result['optimize_info']] + l_df_forecast += [dict_result['forecast']] + + # Generate output + df_data = pd.concat(l_df_data, sort=False, ignore_index=True) + df_metadata = pd.concat(l_df_metadata, sort=False, ignore_index=True) + df_optimize_info = pd.concat(l_df_optimize_info, sort=False, ignore_index=True) + df_forecast = pd.concat(l_df_forecast, sort=False, ignore_index=True) + + return {'forecast': df_forecast, 'data': df_data, 'metadata': df_metadata, 'optimize_info': df_optimize_info} + +def run_forecast_single(df_y, + l_model_trend=None, + l_model_season=None, + date_start_actuals=None, + source_id='src', + extrapolate_years=0, + season_add_mult='add', + include_all_fits=False, + simplify_output=True, + do_find_steps_and_spikes=False, + find_outliers=False, + l_season_yearly=None, + l_season_weekly=None, + l_model_naive=None + ): + """ + + :param df_y: + :type df_y: + :param l_model_trend: + :type l_model_trend: + :param l_model_season: + :type l_model_season: + :param date_start_actuals: + :type date_start_actuals: + :param source_id: + :type source_id: + :param col_name_y: + :type col_name_y: + :param col_name_weight: + :type col_name_weight: + :param col_name_x: + :type col_name_x: + :param col_name_date: + :type col_name_date: + :param return_all_models: + | If True, result includes non-fitting models, with null AIC and an empty forecast df. + | Otherwise, result includes only fitting models, and for time series where no fitting model is available, + | a 'no-best-model' entry with null AIC and an empty forecast df is added. + :type return_all_models: bool + :param return_all_fits: If True, result includes all models for each input time series. Otherwise, only the + best model is included. + :type return_all_fits: bool + :param extrapolate_years: + :type extrapolate_years: float + :param season_add_mult: 'add', 'mult', or 'both'. 
Whether forecast seasonality will be additive, multiplicative, + or the best fit of the two. + :type season_add_mult: str + :param fill_gaps_y_values: If True, gaps in time series will be filled with NaN values + :type fill_gaps_y_values: bool + :param freq: 'W' or 'D' . Sampling frequency of the output forecast: weekly or daily. + :type freq: str + :param do_find_steps_and_spikes: if True, find steps and spikes, create fixed models and add them + to the list of models + :type do_find_steps_and_spikes: bool + :return: + :rtype: + """ + l_df_data = [] + l_df_metadata = [] + l_df_optimize_info = [] + + # Each element in l_fcast_input describes all model configurations for a source time series + source = source_id + + if 'date' in df_y.columns: + freq = detect_freq(df_y.date) + else: + freq = None + + df_y=df_y.copy() + df_y_unfiltered = df_y.copy() + + if date_start_actuals is not None and 'date' in df_y.columns: # Filter: only actuals after date_start_actuals + df_y = df_y.loc[df_y.date >= date_start_actuals] + + date_start_actuals = df_y.date.min() if 'date' in df_y.columns else df_y.x.min() + date_end_actuals = df_y.date.max() if 'date' in df_y.columns else df_y.x.max() + + # If we find outliers, we add a model with dummy variables for the outliers + if find_outliers: + model_outliers, outlier_mask = forecast_models.get_model_outliers(df_y) + if outlier_mask is not None: + if 'weight' in df_y.columns: + df_y['weight'] = df_y['weight'] * outlier_mask + else: + df_y['weight'] = outlier_mask + assert np.issubdtype(df_y.weight.astype(float), np.float64) + else: + model_outliers = None + + # Add actuals to output + # Get weight for metadata + if 'weight' not in df_y.columns: + df_y['weight']=1 + weights = '1' + else: + weights = '{}-{}'.format(df_y['weight'].min(), df_y['weight'].max()) + + # Get long source_id + if isinstance(date_start_actuals, pd.datetime): + date_start_actuals_short = date_start_actuals.date() + date_end_actuals_short = date_end_actuals.date() + else: + date_start_actuals_short = date_start_actuals + date_end_actuals_short = date_end_actuals + actuals_x_range = '{}::{}'.format(date_start_actuals_short, date_end_actuals_short) + source_long = '{}:{}:{}:{}'.format(source, weights, freq, actuals_x_range) + df_actuals = get_df_actuals_clean(df_y, source, source_long) + l_df_data+=[df_actuals] + + if l_model_trend is None: + # By default, try linear and piecewise linear + l_model_trend = [ + #forecast_models.model_naive, + forecast_models.model_linear, + forecast_models.model_linear+forecast_models.model_ramp] + l_model_season_add = None + l_model_season_mult = None + if l_model_season is None: + if 'date' in df_y.columns: + s_date_tmp = df_y.date + if 'weight' in df_y.columns: + s_date_tmp = s_date_tmp.loc[df_y.weight>0] + + l_model_season_add = forecast_models.get_l_model_auto_season(s_date_tmp,season_add_mult='add', + l_season_yearly=l_season_yearly, + l_season_weekly=l_season_weekly, + ) + l_model_season_mult = forecast_models.get_l_model_auto_season(s_date_tmp,season_add_mult='mult', + l_season_yearly=l_season_yearly, + l_season_weekly=l_season_weekly, + ) + else: + l_model_season_add = l_model_season + l_model_season_mult = l_model_season + + + l_model_add = get_list_model(l_model_trend, l_model_season_add, 'add') + l_model_mult = get_list_model(l_model_trend, l_model_season_mult, 'mult') + + if season_add_mult == 'add': + l_model = l_model_add + elif season_add_mult == 'mult': + l_model = l_model_mult + else: # both + l_model = 
np.unique([l_model_add+l_model_mult]).tolist() + # logger_info('debug l_Model',l_model) + if l_model_naive is not None: + l_model = l_model_naive+l_model + + # if model_outliers is not None: + # l_model_outlier = [forecast_models.model_null, model_outliers] + # l_model = get_list_model(l_model, l_model_outlier, 'add') + + if do_find_steps_and_spikes: + a_y = df_y.y.values + a_x = df_y.y + + a_date = df_y.date if 'date' in df_y.columns else None + + steps, spikes = forecast_models.find_steps_and_spikes(a_x, a_y, a_date) + if steps: + steps_summed = reduce(lambda x, y: x + y, steps) + steps_summed.name = '{}_fixed_steps'.format(len(steps)) + l_model = [model + steps_summed for model in l_model] + if spikes: + spikes_mult = reduce(lambda x, y: x * y, spikes) + spikes_mult.name = '{}_fixed_spikes'.format(len(spikes)) + # filter values during the spike + a_y_filt = spikes_mult(a_x, a_date, []) + df_y[a_y_filt == 0] = np.nan + + # exclude samples with weight = 0 + df_y = df_y.loc[df_y.weight > 0] + date_start_actuals = df_y.date.min() if 'date' in df_y.columns else df_y.x.min() + x_start_actuals = df_y.x.min() + + df_actuals_cols = [c for c in ['date','x'] if c in df_y.columns] + + df_actuals_interpolated = ( # Fills gaps, used for extrapolation + df_y_unfiltered + .merge(df_y_unfiltered[df_actuals_cols].drop_duplicates('x').pipe(model_utils.interpolate_df), how='right') + .sort_values(['x']).reset_index(drop=True) + ) + # Update weight column in df_actuals_interpolated + df_actuals_interpolated = df_actuals_interpolated.drop(columns=['weight'],errors='ignore') + df_actuals_interpolated = df_actuals_interpolated.merge(df_y[['x','weight']],how='left') + df_actuals_interpolated['weight']=df_actuals_interpolated.weight.fillna(0) + + # Note - In the above steps, we first remove any samples with weight = 0 + # from the data used for fitting + # then we fill gaps in dates from the table used for extrapolating. 
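+    # Illustration with hypothetical values - a series with one missing sample at x=2:
+    #   input x:      0, 1, 3      ->  interpolated x:  0, 1, 2, 3
+    #   input y:      5., 6., 8.   ->  merged y:        5., 6., NaN, 8.
+    #   input weight: 1, 1, 1      ->  merged weight:   1, 1, 0, 1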
+ # The filled gaps have NaN values in the y column, 0 weight + + for model in l_model: + + dict_fit_model = fit_model(model, df_y, freq, source, df_actuals=df_y_unfiltered) + df_metadata_tmp = dict_fit_model['metadata'] + df_optimize_info = dict_fit_model['optimize_info'] + + l_df_metadata += [df_metadata_tmp] + l_df_optimize_info += [df_optimize_info] + source_long = df_metadata_tmp.source_long.iloc[0] + params = df_metadata_tmp.params.iloc[0] + + if df_metadata_tmp.is_fit.iloc[0]: # If model is fit + + # date_start_actuals = df_y.date.min() + # date_end_actuals = df_y.date.max() + + df_data_tmp = extrapolate_model(model, params, + date_start_actuals,date_end_actuals, + freq, extrapolate_years, x_start_actuals = x_start_actuals, + df_actuals=df_actuals_interpolated) + + df_data_tmp = _get_df_fcast_clean(df_data_tmp, source_id, source_long, model.name) + + l_df_data += [df_data_tmp] + + # Generate output + df_data = pd.concat(l_df_data, sort=False, ignore_index=True) + df_metadata = pd.concat(l_df_metadata, sort=False, ignore_index=True) + df_optimize_info = pd.concat(l_df_optimize_info, sort=False, ignore_index=True) + + # Determine best fits + df_best_fit = ( + df_metadata.loc[df_metadata.is_fit] + .sort_values('aic_c') + .groupby('source', as_index=False).first() + [['source_long', 'model']] + ) + df_best_fit['is_best_fit'] = True + + df_metadata = df_metadata.merge(df_best_fit, how='left') + df_metadata['is_best_fit'] = df_metadata['is_best_fit'].fillna(False).astype(bool) + df_data = df_data.merge(df_best_fit, how='left').reset_index(drop=True) + df_data['is_best_fit'] = df_data['is_best_fit'].fillna(False).astype(bool) + + if not include_all_fits: + df_metadata = df_metadata.loc[df_metadata.is_best_fit].reset_index(drop=True) + df_data = df_data.loc[df_data.is_best_fit | df_data.is_actuals].reset_index(drop=True) + + df_forecast = df_data.pipe(get_pi, n=100) + dict_result = {'forecast': df_forecast, 'data': df_data, 'metadata': df_metadata, 'optimize_info': df_optimize_info} + + if simplify_output: + return df_forecast + else: + return dict_result + + +# TODO: Better define return_all_fits, return_all_models. Document and provide clear use cases +# TODO: Improve test, make shorter +def run_l_forecast(l_fcast_input, + col_name_y='y', col_name_weight='weight', + col_name_x='x', col_name_date='date', + col_name_source='source', + extrapolate_years=0, season_add_mult='add', + include_all_fits=False, + do_find_steps_and_spikes=False, + find_outliers=False): + """ + Generate forecasts for a list of SolverConfig objects, each including a time series, model functions, and other + configuration parameters. + + :param l_fcast_input: List of forecast input configurations. Each element includes a time series, + candidate forecast models for trend and seasonality, and other configuration parameters. For each input + configuration, a forecast time series will be generated. + :type l_fcast_input: list of ForecastInput + :param return_all_models: + | If True, result includes non-fitting models, with null AIC and an empty forecast df. + | Otherwise, result includes only fitting models, and for time series where no fitting model is available, + | a 'no-best-model' entry with null AIC and an empty forecast df is added. + :type return_all_models: bool + :param return_all_fits: If True, result includes all models for each input time series. Otherwise, only the + best model is included. 
+ :type return_all_fits: bool + :param extrapolate_years: + :type extrapolate_years: float + :param season_add_mult: 'add', 'mult', or 'both'. Whether forecast seasonality will be additive, multiplicative, + or the best fit of the two. + :type season_add_mult: str + :param fill_gaps_y_values: If True, gaps in time series will be filled with NaN values + :type fill_gaps_y_values: bool + :param freq: 'W' or 'D' . Sampling frequency of the output forecast: weekly or daily. + :type freq: str + :return: + | dict(data,metadata) + | data: dataframe(date, source, model, y) + | metadata: dataframe('source', 'model', 'res_weights', 'freq', 'is_fit', 'cost', 'aic_c', 'params', 'status') + :rtype: dict + + """ + # TODO: Add check for non-duplicate source ids + l_df_data = [] + l_df_metadata = [] + l_df_optimize_info = [] + + # We can take solver_config_list that are a list or a single forecast_input + if type(l_fcast_input) is not list: + l_fcast_input = [l_fcast_input] + + l_dict_result = [] + for fcast_input in l_fcast_input: + dict_result = run_forecast(fcast_input.df_y, fcast_input.l_model_trend, fcast_input.l_model_season, + fcast_input.date_start_actuals, fcast_input.source_id, + col_name_y, col_name_weight, + col_name_x, col_name_date, + col_name_source, + extrapolate_years, season_add_mult, + include_all_fits, simplify_output=False, + do_find_steps_and_spikes=do_find_steps_and_spikes, + find_outliers=find_outliers) + l_dict_result += [dict_result] + + # Generate output + return aggregate_forecast_dict_results(l_dict_result) + + +# Forecast configuration + +# TODO: Rename to ForecastInput +class ForecastInput: + """ + Class that encapsulates input variables for forecast.run_forecast() + """ + + def __init__(self, source_id, df_y, l_model_trend=None, l_model_season=None, + weights_y_values=1.0, date_start_actuals=None): + self.source_id = source_id + self.df_y = df_y + self.l_model_trend = l_model_trend if l_model_trend is not None else [forecast_models.model_linear] + self.l_model_season = l_model_season + self.weights_y_values = weights_y_values + self.date_start_actuals = date_start_actuals + + def __str__(self): + str_result = ( + 'SolverConfig: {source_id} ; {df_y_shape} ; {weights_y_values};' + ' {l_model_trend}; {l_model_season} ; {date_start_actuals}' + ).format(source_id=self.source_id, df_y_shape=self.df_y.shape, + l_model_trend=to_str_function_list(self.l_model_trend), + l_model_season=to_str_function_list(self.l_model_season), + weights_y_values=self.weights_y_values, date_start_actuals=self.date_start_actuals) + return str_result + + def __repr__(self): + return self.__str__() + + # TODO: REMOVE + @classmethod + def create(cls, source_id, df_y, l_model_trend, l_model_season=None, + weights_y_values=1.0, date_start_actuals=None): + return cls(source_id, df_y, pd.Series(l_model_trend), l_model_season, + weights_y_values, date_start_actuals) + + +""" +Draft for a parallel computing version: +run_forecast_parallel(n) +- take solver_config_list, split into n parts +- open n processes for run_forecast, each with 1/n of solver_config_list +- merge outputs: a dict with a pd.concat() of each output dataframe +- challenge: pickling objects: solver_config_list, pandas dataframe +- potential solution: have solver_config_list replace dataframes with file paths + +""" + + +def get_pi(df_forecast, n=100): + if 'source' in df_forecast.columns and df_forecast.source.nunique() > 1: + df_result = ( + df_forecast + .groupby('source', as_index=False) + .apply(_get_pi_single_source, n) + 
.sort_values(['source', 'is_actuals', 'date']) + .reset_index(drop=True) + ) + else: + df_result = _get_pi_single_source(df_forecast, n) + return df_result + + +# TODO: Test +def _get_pi_single_source(df_forecast, n=100): + # n: Number of bootstrapped samples for prediction interval + + if 'is_best_fit' in df_forecast.columns: + df_forecast = df_forecast.loc[df_forecast.is_actuals | df_forecast.is_best_fit].copy() + else: + df_forecast = df_forecast.copy() + + if 'source' in df_forecast.columns: + l_cols = ['date', 'source'] + else: + l_cols = ['date'] + + # logger_info('DEBUG - df_forecast', df_forecast.head(1)) + if 'is_weight' in df_forecast.columns and df_forecast.is_weight.any(): + + # Filter out dates for outliers with weight=0 + df_filtered_dates = ( + df_forecast.loc[df_forecast.is_weight & df_forecast.y > 0] + [['date', 'source']] + ) + + # Take filtered actuals + df_actuals_unfiltered = df_forecast.loc[df_forecast.is_actuals & ~df_forecast.is_weight & + ~df_forecast.y.isnull()] + df_actuals = (df_actuals_unfiltered[['date', 'y']] + .merge(df_filtered_dates, how='inner') + .rename({'y': 'actuals'}, axis=1) + ) + date_last_actuals = df_actuals.date.max() + else: # No weight data - use all actuals rows + df_actuals_unfiltered = df_forecast.loc[df_forecast.is_actuals & ~df_forecast.y.isnull()] + df_actuals = (df_actuals_unfiltered[['date', 'y']] + .rename({'y': 'actuals'}, axis=1) + ) + date_last_actuals = df_actuals.date.max() + # Compute residuals for filtered actuals + df_residuals_tmp = df_forecast.loc[~df_forecast.is_actuals & ~df_forecast.y.pipe(pd.isnull)][l_cols+['model', 'y']] + + df_residuals = df_residuals_tmp.merge(df_actuals, how='inner') + df_residuals['res'] = df_residuals['actuals'] - df_residuals['y'] + + # Filter out null values, e.g. 
due to null actuals + df_residuals = df_residuals.loc[~df_residuals.res.isnull()] + + if df_residuals.empty: # May happen if no forecast could be generated + logger.warning('No forecast data for source %s', df_forecast.source.head(1).iloc[0]) + return df_actuals_unfiltered[l_cols + ['is_actuals', 'model', 'y']] + + # Generate table with prediction interval + df_forecast_pi = ( + df_forecast + .loc[~df_forecast.is_actuals & (df_forecast.date > date_last_actuals)] + [l_cols+['model','y']] + ) + + s_residuals_tmp = df_residuals.res + a_forecast_point = df_forecast_pi.y.values + + length = a_forecast_point.size + + a_sample = s_residuals_tmp.sample(length * n, replace=True).values.reshape(n, length) + a_sample = np.cumsum(a_sample, axis=1) + + a_q5 = np.percentile(a_sample, 5, axis=0) + a_q95 = np.percentile(a_sample, 95, axis=0) + a_q80 = np.percentile(a_sample, 80, axis=0) + a_q20 = np.percentile(a_sample, 20, axis=0) + + df_forecast_pi['q5'] = a_q5 + df_forecast_pi.y + df_forecast_pi['q20'] = a_q20 + df_forecast_pi.y + df_forecast_pi['q80'] = a_q80 + df_forecast_pi.y + df_forecast_pi['q95'] = a_q95 + df_forecast_pi.y + df_forecast_pi['is_actuals'] = False + + # Past forecast samples, no prediction interval + df_forecast_past = ( + df_forecast + .loc[~df_forecast.is_actuals & (df_forecast.date <= date_last_actuals)] + [l_cols+['model', 'is_actuals', 'y']] + ) + + df_actuals_unfiltered = df_actuals_unfiltered[l_cols + ['is_actuals', 'model', 'y']] + df_pi_result = pd.concat([df_actuals_unfiltered, df_forecast_past, df_forecast_pi, ], sort=False, ignore_index=True) + + return df_pi_result diff --git a/anticipy/forecast_models.py b/anticipy/forecast_models.py new file mode 100644 index 0000000..8b46d1f --- /dev/null +++ b/anticipy/forecast_models.py @@ -0,0 +1,1549 @@ +# -*- coding: utf-8 -*- +# +# License: This module is released under the terms of the LICENSE file +# contained within this applications INSTALL directory + +""" +Defines the ForecastModel class, which encapsulates model functions used in forecast model fitting, as well as +their number of parameters and initialisation parameters. +""" + +# TODO: Check parameter initialisation + +# TODO: It may be convenient to have different model functions for addition and subtraction, +# e.g. to return 1 or 0 by default + +# -- Coding Conventions +# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide +# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings + +# -- Public Imports +import logging +import numpy as np +import pandas as pd +import itertools + +# -- Private Imports +from anticipy import model_utils + +# -- Globals +logger = logging.getLogger(__name__) +dict_fourier = { + 'period': 365.25, # days in year + 'harmonics': 10 # TODO: evaluate different harmonics values +} + + +# -- Exception classes + +# -- Functions +def logger_info(msg, data): + # Convenience function for easier log typing + logger.info(msg + '\n%s', data) + + +def _is_multi_ts(a): + return a.ndim > 1 and a.shape[1] > 1 + + +def _get_f_init_params_default(n_params): + # Generate a default function for initialising model parameters: use random values between 0 and 1 + return lambda a_x=None, a_y=None, a_date=None, is_mult=False: np.random.uniform(low=0.001, high=1, size=n_params) + + +def _get_f_bounds_default(n_params): + # Generate a default function for model parameter boundaries. 
Default boundaries are (-inf, inf) + return lambda a_x=None, a_y=None, a_date=None: (n_params * [-np.inf], n_params * [np.inf]) + + +def _get_f_add_2_f_models(forecast_model1, forecast_model2): + def f_add_2_f_models(a_x, a_date, params, is_mult=False, **kwargs): + params1 = params[0:forecast_model1.n_params] + params2 = params[forecast_model1.n_params:] + return ( + forecast_model1.f_model(a_x, a_date, params1, is_mult=False, **kwargs) + + forecast_model2.f_model(a_x, a_date, params2, is_mult=False, **kwargs) + ) + + return f_add_2_f_models + + +def _get_f_mult_2_f_models(forecast_model1, forecast_model2): + def f_mult_2_f_models(a_x, a_date, params, is_mult=False, **kwargs): + params1 = params[0:forecast_model1.n_params] + params2 = params[forecast_model1.n_params:] + return ( + forecast_model1.f_model(a_x, a_date, params1, is_mult=True, **kwargs) * + forecast_model2.f_model(a_x, a_date, params2, is_mult=True, **kwargs) + ) + + return f_mult_2_f_models + + +def _get_f_add_2_f_init_params(f_init_params1, f_init_params2): + def f_add_2_f_init_params(a_x, a_y, a_date=None, is_mult=False): + return np.concatenate([f_init_params1(a_x, a_y, a_date, is_mult=False), + f_init_params2(a_x, a_y, a_date, is_mult=False)]) + return f_add_2_f_init_params + +def _get_f_mult_2_f_init_params(f_init_params1, f_init_params2): + def f_mult_2_f_init_params(a_x, a_y, a_date=None, is_mult=False): + return np.concatenate([f_init_params1(a_x, a_y, a_date, is_mult=True), + f_init_params2(a_x, a_y, a_date, is_mult=True)]) + return f_mult_2_f_init_params + + +def _get_f_concat_2_bounds(forecast_model1, forecast_model2): + def f_add_2_f_bounds(a_x, a_y, a_date=None): + return np.concatenate((forecast_model1.f_bounds(a_x, a_y, a_date), + forecast_model2.f_bounds(a_x, a_y, a_date)), axis=1) + + return f_add_2_f_bounds + + +# def _get_add_2_bounds(f_bounds1, f_bounds2): +# return np.concatenate((f_bounds1, f_bounds2), axis=1) + + +# -- Classes + +class ForecastModel: + """ + Class that encapsulates model functions for use in forecasting, as well as + their number of parameters and functions for parameter initialisation. + + A ForecastModel instance is initialized with a model name, a number of model parameters, and a model function. + Class instances are callable - when called as a function, their internal model function is used. The main purpose + of ForecastModel objects is to generate predicted values for a time series, given a set of parameters. + These values can be compared to the original series to get an array of residuals:: + + y_predicted = model(a_x, a_date, params) + residuals = (a_y - y_predicted) + + This is used in an optimization loop to obtain the optimal parameters for the model. + + The reason for using this class instead of raw model functions is that ForecastModel supports function composition:: + + model_sum = fcast_model1 + fcast_model2 # fcast_model 1 and 2 are ForecastModel instances, and so is model_sum + a_y1 = fcast_model1(a_x, a_date, params1) + fcast_model2(a_x, a_date, params2) + params = np.concatenate([params1, params2]) + a_y2 = model_sum(a_x, a_date, params) + a_y1 == a_y2 # True + + Forecast models can be added or multiplied, with the + and * operators. Multiple levels of composition are + supported:: + + model = (model1 + model2) * model3 + + Model composition is used to aggregate trend and seasonality model components, among other uses. 
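+
+    For illustration, a minimal composition sketch (parameter values here are arbitrary)::
+
+        import numpy as np
+        from anticipy import forecast_models
+
+        model = forecast_models.model_linear + forecast_models.model_constant
+        model.n_params                      # 3: two linear parameters, then one constant parameter
+        a_x = np.arange(10.)
+        params = np.array([0.5, 1., 10.])   # concatenated in composition order
+        a_y = model(a_x, None, params)      # a_date=None is fine for non date-aware models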
+ + Model functions have the following signature: + + - f(a_x, a_date, params, is_mult) + - a_x : array of floats + - a_date: array of dates, same length as a_x. Only required for date-aware models, e.g. for weekly seasonality. + - params: array of floats - model parameters - the optimisation loop updates this to fit our actual values. Each + model function uses a fixed number of parameters. + - is_mult: boolean. True if the model is being used with multiplicative composition. Required because + some model functions (e.g. steps) have different behaviour + when added to other models than when multiplying them. + - returns an array of floats - with same length as a_x - output of the model defined by this object's + modelling function f_model and the current set of parameters + + By default, model parameters are initialized as random values between 0 and 1. + It is possible to define a parameter initialization function that picks initial values + based on the original time series. + This is passed during ForecastModel creation with the argument f_init_params. + Parameter initialization is compatible with model composition: + the initialization function of each component will be used for that component's parameters. + + Parameter initialisation functions have the following signature: + + - f_init_params(a_x, a_y, is_mult) + - a_x: array of floats - same length as time series + - a_y: array of floats - time series values + - returns an array of floats - with length equal to this object's n_params value + + By default, model parameters have no boundaries. + However, it is possible to define a boundary function for a model, + that sets boundaries for each model parameter, based on the input time series. + This is passed during ForecastModel creation with the argument f_bounds. + Boundary definition is compatible with model composition: + the boundary function of each component will be used for that component's parameters. + + Boundary functions have the following signature: + + - f_bounds(a_x, a_y, a_date) + - a_x: array of floats - same length as time series + - a_y: array of floats - time series values + - a_date: array of dates, same length as a_x. Only required for date-aware models, e.g. for weekly seasonality. + - returns a tuple of 2 arrays of floats. The first defines minimum parameter boundaries, and the second + the maximum parameter boundaries. + + Our input time series should meet the following constraints: + + - Minimum required samples depends on number of model parameters + - May include null values + - May include multiple values per sample + - A date array is only required if the model is date-aware + + Class Usage:: + + model_x = ForecastModel(name, n_params, f_model, f_init_params) + model_name = model_x.name # Get model name + n_params = model_x.n_params # Get number of model parameters + f_init_params = model_x.f_init_params # Get parameter initialisation function + init_params = f_init_params(t_values, y_values) # Get initial parameters + f_model = model_x.f_model # Get model fitting function + y = f_model(a_x, a_date, parameters) # Get model output + + The following pre-generated models are available. They are available as attributes from this module: + + .. csv-table:: Forecast models + :header: "name", "params", "formula","notes" + :widths: 20, 10, 20, 40 + + "model_null",0, "y=0", "Does nothing. Used to disable components (e.g. 
seasonality)" + "model_constant",1, "y=A", "Constant model" + "model_linear",2, "y=Ax + B", "Linear model" + "model_linear_nondec",2, "y=Ax + B", "Non decreasing linear model. With boundaries to ensure model slope >=0" + "model_quasilinear",3, "y=A*(x^B) + C", "Quasilinear model" + "model_exp",2, "y=A * B^x", "Exponential model" + "model_step",2, "y=0 if x=A", "Step model" + "model_two_steps",4, "see model_step", "2 step models. Parameter initialization is aware of # of steps." + "model_sigmoid_step",3, "y = A + (B - A) / (1 + np.exp(- D * (x - C)))", "Sigmoid step model" + "model_sigmoid",3, "y = A + (B - A) / (1 + np.exp(- D * (x - C)))", "Sigmoid model" + "model_season_wday",7, "see desc.", "Weekday seasonality model. Assigns a constant value to each weekday" + "model_season_wday",6, "see desc.", "6-param weekday seasonality model. As above, with one constant set to 0." + "model_season_wday_2",2, "see desc.", "Weekend seasonality model. Assigns a constant to each of weekday/weekend" + "model_season_month",12, "see desc.", "Month seasonality model. Assigns a constant value to each month" + "model_season_fourier_yearly",10, "see desc", "Fourier yearly seasonality model" + + """ + + def __init__(self, name, n_params, f_model, f_init_params=None, f_bounds=None): + """ + Create ForecastModel + + :param name: + :type name: + :param n_params: + :type n_params: + :param f_model: + :type f_model: + :param f_init_params: + :type f_init_params: + :param f_bounds: + :type f_bounds: + """ + self.name = name + self.n_params = n_params + self.f_model = f_model + if f_init_params is not None: + self.f_init_params = f_init_params + else: + # Default initial parameters: random values between 0 and 1 + self.f_init_params = _get_f_init_params_default(n_params) + + if f_bounds is not None: + self.f_bounds = f_bounds + else: + self.f_bounds = _get_f_bounds_default(n_params) + + # TODO - REMOVE THIS - ASSUME NORMALIZED INPUT + def _get_f_init_params_validated(f_init_params): + # Adds argument validation to a parameter initialisation function + def f_init_params_validated(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_x is not None and pd.isnull(a_x).any(): + raise ValueError('a_x cannot have null values') + return f_init_params(a_x, a_y, a_date, is_mult) + + return f_init_params_validated + + # Add logic to f_init_params that validates input + self.f_init_params = _get_f_init_params_validated(self.f_init_params) + + def __call__(self, a_x, a_date, params, is_mult=False, **kwargs): + # assert len(params)==self.n_params + return self.f_model(a_x, a_date, params, is_mult, **kwargs) + + def __str__(self): + return self.name + + def __repr__(self): + return 'ForecastModel:{}'.format(self.name) + + def __add__(self, forecast_model): + # Check for nulls + if self.name == 'null': + return forecast_model + if forecast_model.name == 'null': + return self + name = '({}+{})'.format(self.name, forecast_model.name) + n_params = self.n_params + forecast_model.n_params + f_model = _get_f_add_2_f_models(self, forecast_model) + f_init_params = _get_f_add_2_f_init_params(self.f_init_params, forecast_model.f_init_params) + f_bounds = _get_f_concat_2_bounds(self, forecast_model) + return ForecastModel(name, n_params, f_model, f_init_params, + f_bounds=f_bounds) + + def __radd__(self, other): + return self.__add__(other) + + def __mul__(self, forecast_model): + if self.name == 'null': + return forecast_model + if forecast_model.name == 'null': + return self + name = '({}*{})'.format(self.name, forecast_model.name) + 
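+        # Multiplicative composition: as with __add__, the two parameter vectors are
+        # concatenated, but the composed f_model evaluates each sub-model with
+        # is_mult=True and multiplies the results.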
n_params = self.n_params + forecast_model.n_params + f_model = _get_f_mult_2_f_models(self, forecast_model) + f_init_params = _get_f_mult_2_f_init_params(self.f_init_params, forecast_model.f_init_params) + f_bounds = _get_f_concat_2_bounds(self, forecast_model) + return ForecastModel(name, n_params, f_model, f_init_params, f_bounds) + + def __rmul__(self, other): + return self.__mul__(other) + + def __eq__(self, other): + if isinstance(self, other.__class__): + return self.name == other.name + return NotImplemented + + def __ne__(self, other): + x = self.__eq__(other) + if x is not NotImplemented: + return not x + return NotImplemented + + def __hash__(self): + return hash(self.name) + + def __lt__(self, other): + return self.name < other.name + +# - Null model: 0 + +def _f_model_null(a_x, a_date, params, is_mult=False, **kwargs): + # This model does nothing - used to disable model components (e.g. seasonality) when adding/multiplying + # multiple functions + return float(is_mult) # Returns 1 if multiplying, 0 if adding + + +model_null = ForecastModel('null', 0, _f_model_null) + + +# - Constant model: :math:`Y = A` + +def _f_model_constant(a_x, a_date, params, is_mult=False, **kwargs): + [A] = params + y = np.full(len(a_x), A) + return y + + +def _f_init_params_constant(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(0, 1, 1) + else: + return np.nanmean(a_y) + np.random.uniform(0, 1, 1) + + +model_constant = ForecastModel('constant', 1, _f_model_constant, _f_init_params_constant) + + +# - Naive model: Y = Y(x-1) +# Note: This model requires passing the actuals data - it is not fitted by regression +# We still pass it to forecast.fit_model() for consistency with the rest of the library + +def _f_model_naive(a_x, a_date, params, is_mult=False, df_actuals=None): + if df_actuals is None: + raise ValueError('model_naive requires a df_actuals argument') + df_out_tmp = pd.DataFrame({'date':a_date,'x':a_x}) + df_out = ( + df_actuals.drop_duplicates('x') # This is not really intended to work with multiple values per sample + .merge(df_out_tmp, how='outer') + ) + df_out['y'] = df_out.y.shift(1).fillna(method='ffill').fillna(method='bfill') + df_out = df_out.loc[df_out.x.isin(a_x)] + #df_out = df_out_tmp.merge(df_out, how='left') # TODO: CHECK THAT X,DATE order is preserved + # TODO: df_out = df_out.merge(df_out_tmp, how='right') + return df_out.y.values + +model_naive = ForecastModel('naive',0, _f_model_naive) + +# - Seasonal naive model +# Note: This model requires passing the actuals data - it is not fitted by regression +# We still pass it to forecast.fit_model() for consistency with the rest of the library + +def _fillna_wday(df): + df = df.copy() + df['wday'] = df.date.dt.weekday + df_tmp = df[['date', 'x']].copy() + for wday in np.arange(0, 7): + wday_name = 'wday_{}'.format(wday) + df_tmp[wday_name] = ( + df.y.where((df.wday) == wday, np.NaN) # for each wday column, set to null all values from other weekdays + .fillna(method='ffill') # fill nulls with last weekly sample + .shift(1) # shift so that model for each sample is last non-null weekly sample + .where(df.wday == wday, np.NaN) # set values for other weekdays to null, so we can aggregate with sum + ) + # logger_info('debug: df_tmp: ', df_tmp) + + # Aggregate: add all weekly columns together, keep null only if all columns are null + def aggregate_wday(s_tmp): + if np.all(np.isnan(s_tmp)): + return np.NaN + else: + return np.nansum(s_tmp) + + df['y_out'] = df_tmp.loc[:, 
df_tmp.columns.str.startswith('wday_')].apply(aggregate_wday, axis=1) + return df + + +def _f_model_snaive_wday(a_x, a_date, params, is_mult=False, df_actuals=None): + if df_actuals is None: + raise ValueError('model_snaive_wday requires a df_actuals argument') + + df_actuals_model = _fillna_wday(df_actuals.drop_duplicates('x')) + + df_last_week = df_actuals_model.drop_duplicates('wday', keep='last')[['wday', 'y']] + df_last_week['y_out'] = df_last_week['y'] + df_last_week = df_last_week[['wday', 'y_out']] + + # logger_info('df_actuals_model:', df_actuals_model) + # logger_info('df_last_week:', df_last_week) + + df_out_tmp = pd.DataFrame({'date': a_date, 'x': a_x}) + df_out_tmp['wday'] = df_out_tmp.date.dt.weekday + + # logger_info('df_out_tmp:', df_out_tmp) + + df_out_actuals = ( + df_actuals_model.merge(df_out_tmp, how='left') + ) + df_out_extrapolated = ( + df_out_tmp.loc[~df_out_tmp.date.isin(df_actuals_model.date)] + .merge(df_last_week) + .sort_values('date').reset_index(drop=True) + ) + + df_out = pd.concat([df_out_actuals, df_out_extrapolated], sort=False) + + # logger_info('df_out_actuals:', df_out_actuals) + # logger_info('df_out_extrapolated:', df_out_extrapolated) + + # Note: the line below causes trouble when samples are filtered from a_x, a_date due to find_outliers + df_out = df_out.loc[df_out.x.isin(a_x)] + + # logger_info('df_out:', df_out) + + return df_out.y_out.values + +model_snaive_wday = ForecastModel('snaive_wday', 0, _f_model_snaive_wday) + + +# - Spike model: :math:`Y = A`, when x_min <= X < x_max +def _f_model_spike(a_x, a_date, params, is_mult=False, **kwargs): + [A, x_min, x_max] = params + if is_mult: + c = 1 + else: + c = 0 + y = np.concatenate(( + np.full(int(x_min), c), + np.full(int(x_max - x_min), A), + np.full(len(a_x) - int(x_max), c) + )) + return y + + +# TODO: test f_init_params for all models +def _f_init_params_spike(a_x=None, a_y=None, a_date=None, is_mult=False): + """ params are spike height, x start, x end """ + # if not a_y.any(): + if a_y is None: + return [1] + np.random.uniform(0, 1, 1) + [2] + else: + diffs = np.diff(a_y) + # if diffs: + if True: + diff = max(diffs) + x_start = np.argmax(diffs) + x_end = x_start + 1 + return np.array([diff, x_start, x_end]) + # else: + # rand = np.random.randint(1, len(a_y) - 1) + + return np.array([1, rand, rand + 1]) + + +model_spike = ForecastModel('spike', 3, _f_model_spike, _f_init_params_spike) + + +# - Spike model for dates - dates are fixed for each model + +def _f_model_spike_date(a_x, a_date, params, date_start, date_end, is_mult=False): + [A] = params + mask_spike = (a_date >= date_start) * (a_date < date_end) + if is_mult: + y = mask_spike * A + ~mask_spike + else: + y = mask_spike * A + + return y + + +def _f_init_params_spike(a_x=None, a_y=None, a_date=None, is_mult=False): + """ params are spike height, x start, x end """ + if a_y is None: + return np.concatenate([np.array([1]) + np.random.uniform(0, 1, 1)]) + else: + diffs = np.diff(a_y) + # if diffs: + if True: + diff = max(diffs) + return np.array([diff]) + # else: + # rand = np.random.randint(1, len(a_y) - 1) + # return [1] + + +def get_model_spike_date(date_start, date_end): + f_model = ( + lambda a_x, a_date, params, is_mult=False, **kwargs: + _f_model_spike_date(a_x, a_date, params, date_start, date_end, is_mult) + ) + model_spike_date = ForecastModel('spike_date[{},{}]'.format(pd.to_datetime(date_start).date(), + pd.to_datetime(date_end).date()), + 1, f_model, _f_init_params_spike) + return model_spike_date + + +# - Linear model: 
:math:`Y = A*x + B` + +def _f_model_linear(a_x, a_date, params, is_mult=False, **kwargs): + (A, B) = params + y = A * a_x + B + return y + + +def _f_init_params_linear(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(low=0, high=1, size=2) + else: # TODO: Improve this + if a_x is not None: + a_x_size = np.unique(a_x).size-1 + else: + a_x_size = a_y.size-1 + A = (a_y[-1]-a_y[0])/a_x_size + B = a_y[0] + # Uniform low= 0*m, high = 1*m + return np.array([A, B]) + + +model_linear = ForecastModel('linear', 2, _f_model_linear, _f_init_params_linear) + + +def f_init_params_linear_nondec(a_x=None, a_y=None, a_date=None, is_mult=False): + params = _f_init_params_linear(a_x, a_y, a_date) + if params[0] < 0: + params[0] = 0 + return params + + +def f_bounds_linear_nondec(a_x=None, a_y=None, a_date=None): + # first param should be between 0 and inf + return [0, -np.inf], [np.inf, np.inf] + + +model_linear_nondec = ForecastModel('linear', 2, _f_model_linear, + f_init_params=f_init_params_linear_nondec, + f_bounds=f_bounds_linear_nondec) + + +# - QuasiLinear model: :math:`Y = A t^{B} + C` + +def _f_model_quasilinear(a_x, a_date, params, is_mult=False, **kwargs): + (A, B, C) = params + y = A * np.power(a_x, B) + C + return y + + +model_quasilinear = ForecastModel('quasilinear', 3, _f_model_quasilinear) + + +# - Exponential model: math:: Y = A * B^t +def _f_model_exp(a_x, a_date, params, is_mult=False, **kwargs): + (A, B) = params + y = A * np.power(B, a_x) + return y + + +model_exp = ForecastModel('exponential', 2, _f_model_exp) + + +def f_init_params_exp_dec(a_x=None, a_y=None, a_date=None, is_mult=False): + """ B param must be <= 1 to have exponential decreasing """ + params = _get_f_init_params_default(2)(a_x, a_y, a_date) + return params + + +def f_bounds_exp_dec(a_x=None, a_y=None, a_date=None): + # first param should be between 0 and inf + return [-np.inf, -1], [np.inf, 1] + + +model_exp_dec = ForecastModel('exponential_dec', 2, _f_model_exp, + f_init_params=f_init_params_exp_dec, + f_bounds=f_bounds_exp_dec) + + +# - Step function: :math:`Y = {0, if x < A | B, if x >= A}` +# A is the time of step, and B is the step +def _f_step(a_x, a_date, params, is_mult=False, **kwargs): + (A, B) = params + if is_mult: + y = 1 + (B - 1) * np.heaviside(a_x - A, 1) + else: + y = B * np.heaviside(a_x - A, 1) + return y + +# TODO: Implement initialisation for multiplicative composition +def _f_init_params_step(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(0, 1, 2) + else: + if a_y.ndim > 1: + a_y = a_y[:, 0] + df = pd.DataFrame({'b': a_y}) + # max difference between consecutive values + df['diff'] = df.diff().abs() + # if is_mult, replace above line with something like np.concatenate([[np.NaN],a_y[:-1]/a_y[1:]]) + a = df.nlargest(1, 'diff').index[0] + b = df['diff'].iloc[a] + return np.array([a, b * 2]) + + +model_step = ForecastModel('step', 2, _f_step, _f_init_params_step) + + +# - Spike model for dates - dates are fixed for each model + +def _f_model_step_date(a_x, a_date, params, date_start, is_mult=False): + [A] = params + mask_step = (a_date>=date_start).astype(float) + if is_mult: + # y = mask_step*A + ~mask_step + y = mask_step * (A - 1) + 1 + else: + y = mask_step * A + + return y + + +# TODO: Implement initialisation for multiplicative composition +def _f_init_params_step_date(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(0, 1, 1) + else: + if a_y.ndim > 1: + a_y = 
a_y[:, 0] + df = pd.DataFrame({'b': a_y}) + # max difference between consecutive values + df['diff'] = df.diff().abs() + # if is_mult, replace above line with something like np.concatenate([[np.NaN],a_y[:-1]/a_y[1:]]) + a = df.nlargest(1, 'diff').index[0] + b = df['diff'].iloc[a] + return np.array([b * 2]) + + +def get_model_step_date(date_start): + date_start = pd.to_datetime(date_start) + f_model = ( + lambda a_x, a_date, params, is_mult=False, **kwargs: + _f_model_step_date(a_x, a_date, params, date_start, is_mult) + ) + model_step_date = ForecastModel('step_date[{}]'.format(date_start.date()), + 1, f_model, _f_init_params_step_date) + return model_step_date + + +# Two step functions +def _f_n_steps(n, a_x, a_date, params, is_mult=False): + if is_mult: + y = 1 + else: + y = 0 + + for i in range(0, n + 1, 2): + A, B = params[i: i + 2] + if is_mult: + y = y * _f_step(a_x, a_date, (A, B), is_mult) + else: + y = y + _f_step(a_x, a_date, (A, B), is_mult) + return y + + +def _f_two_steps(a_x, a_date, params, is_mult=False, **kwargs): + return _f_n_steps(n=2, a_x=a_x, a_date=a_date, params=params, is_mult=is_mult) + + +def _f_init_params_n_steps(n=2, a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(0, 1, n * 2) + else: + # max difference between consecutive values + if a_y.ndim > 1: + a_y = a_y[:, 0] + df = pd.DataFrame({'b': a_y}) + df['diff'] = df.diff().abs() + # if is_mult, replace above line with something like np.concatenate([[np.NaN],a_y[:-1]/a_y[1:]]) + a = df.nlargest(n, 'diff').index[0:n].values + b = df['diff'].iloc[a].values + params = [] + for i in range(0, n): + params += [a[i], b[i]] + return np.array(params) + + +def _f_init_params_two_steps(a_x=None, a_y=None, a_date=None, is_mult=False): + return _f_init_params_n_steps(n=2, a_x=a_x, a_y=a_y, a_date=a_date, is_mult=is_mult) + + +model_two_steps = ForecastModel('two_steps', 2 * 2, _f_two_steps, _f_init_params_two_steps) + + +# - Sigmoid step function: :math:`Y = {A + (B - A) / (1 + np.exp(- D * (a_x - C)))}` +# Spans from A to B, C is the position of the step in x axis +# and D is how steep the increase is +def _f_sigmoid(a_x, a_date, params, is_mult=False, **kwargs): + (B, C, D) = params + if is_mult: + A = 1 + else: + A = 0 + # TODO check if a_x is negative + y = A + (B - A) / (1 + np.exp(- D * (a_x - C))) + return y + + +def _f_init_params_sigmoid_step(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + return np.random.uniform(0, 1, 3) + else: + if a_y.ndim > 1: + a_y = a_y[:, 0] + df = pd.DataFrame({'y': a_y}) + # max difference between consecutive values + df['diff'] = df.diff().abs() + c = df.nlargest(1, 'diff').index[0] + b = df.loc[c, 'y'] + d = b * b + return b, c, d + + +def _f_init_bounds_sigmoid_step(a_x=None, a_y=None, a_date=None): + if a_y is None: + return [-np.inf, -np.inf, 0.], 3 * [np.inf] + + if a_y.ndim > 1: + a_y = a_y[:, 0] + if a_x.ndim > 1: + a_x = a_x[:, 0] + diff = max(a_y) - min(a_y) + b_min = -2 * diff + b_max = 2 * diff + c_min = min(a_x) + c_max = max(a_x) + d_min = 0. 
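+    # The steepness parameter D is bounded below by 0; the direction of the step is
+    # already determined by the sign of B.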
+ d_max = np.inf + return [b_min, c_min, d_min], [b_max, c_max, d_max] + + +# In this model, parameter initialization is aware of number of steps +model_sigmoid_step = ForecastModel('sigmoid_step', 3, _f_sigmoid, _f_init_params_sigmoid_step, + f_bounds=_f_init_bounds_sigmoid_step) + +model_sigmoid = ForecastModel('sigmoid', 3, _f_sigmoid) + + +# Ramp functions - used for piecewise linear models + +# example : model_linear_pw2 = model_linear + model_ramp +# example 2: model_linear_p23 = model_linear + model_ramp + model_ramp + +# - Ramp function: :math:`Y = {0, if x < A | B, if x >= A}` +# A is the time of step, and B is the step +def _f_ramp(a_x, a_date, params, is_mult=False, **kwargs): + (A, B) = params + if is_mult: + y = 1 + (a_x - A) * (B) * np.heaviside(a_x - A, 1) + else: + y = (a_x - A) * B * np.heaviside(a_x - A, 1) + return y + + +def _f_init_params_ramp(a_x=None, a_y=None, a_date=None, is_mult=False): + # TODO: set boundaries: a_x (0.2, 0.8) + if a_y is None: + if a_x is not None: + nfirst_last = int(np.ceil(0.15 * a_x.size)) + a = np.random.uniform(a_x[nfirst_last],a_x[-nfirst_last-1],1) + else: + a = np.random.uniform(0, 1, 1) + b = np.random.uniform(0, 1, 1) + + return np.concatenate([a, + b]) + else: + df = pd.DataFrame({'b': a_y}) # TODO: FILTER A_Y BY 20-80 PERCENTILE IN A_X + if a_x is not None: + # + df['x']=a_x + # Required because we support input with multiple samples per x value + df = df.drop_duplicates('x') + df=df.set_index('x') + # max difference between consecutive values -- this assumes no null values in series + df['diff2'] = df.diff().diff().abs() + + # We ignore the last 15% of the time series + skip_samples = int(np.ceil(df.index.size * 0.15)) + + a = (df + .head(-skip_samples) + .tail(-skip_samples) + .nlargest(1, 'diff2').index[0] + ) + b = df['diff2'].loc[a] + # TODO: replace b with estimation of slope in segment 2 minus slope in segment 1 - see init_params_linear + # logger.info('DEBUG: init params ramp2: %s - %s ', a.tolist(),b.tolist()) + return np.array([a, b]) + + +def _f_init_bounds_ramp(a_x=None, a_y=None, a_date=None): + if a_x is None: + a_min = -np.inf + a_max = np.inf + else: + #a_min = np.min(a_x) + nfirst_last = int(np.ceil(0.15 * a_x.size)) + a_min = a_x[nfirst_last] + a_max = a_x[-nfirst_last] + #a_min = np.percentile(a_x, 15) + #a_max = np.percentile(a_x,85) + if a_y is None: + b_min = -np.inf + b_max = np.inf + else: + # df = pd.DataFrame({'b': a_y}) # TODO: FILTER A_Y BY 20-80 PERCENTILE IN A_X + # #max_diff2 = np.max(df.diff().diff().abs()) + # max_diff2 = np.max(np.abs(np.diff(np.diff(a_y)))) + # + # b_min = -2*max_diff2 + # b_max = 2*max_diff2 + + b_min = -np.inf + b_max = np.inf + # logger_info('DEBUG: BOUNDS:',(a_min, b_min,a_max, b_max)) + return ([a_min, b_min], [a_max, b_max]) + + +model_ramp = ForecastModel('ramp', 2, _f_ramp, _f_init_params_ramp, _f_init_bounds_ramp) + + +# - Weekday seasonality + +def _f_model_season_wday(a_x, a_date, params, is_mult=False, **kwargs): + # Weekday seasonality model, 6 params + params_long = np.concatenate([[float(is_mult)], params]) # params_long[0] is default series value, + return params_long[a_date.weekday] + + +model_season_wday = ForecastModel('season_wday', 6, _f_model_season_wday) + + +# - Month seasonality +def _f_init_params_season_month(a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None or a_date is None: + return np.random.uniform(low=-1, high=1, size=11) + else: # TODO: Improve this + l_params_long = [np.mean(a_y[a_date.month==i]) for i in np.arange(1,13)] + 
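+        # December (the last month) is used as the baseline level: the remaining 11
+        # parameters are additive offsets from it, or ratios of it if is_mult is True.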
l_baseline = l_params_long[-1] + l_params = l_params_long[:-1] + if not is_mult: + l_params_add = l_params-l_baseline + return l_params_add + else: + l_params_mult = l_params/l_baseline + return l_params_mult + +def _f_model_season_month(a_x, a_date, params, is_mult=False, **kwargs): + # Month of December is taken as default level, has no parameter + params_long = np.concatenate([[float(is_mult)], params]) # params_long[0] is default series value, + return params_long[a_date.month-1] + +model_season_month = ForecastModel('season_month', 11, _f_model_season_month, _f_init_params_season_month) + +model_season_month_old = ForecastModel('season_month_old', 11, _f_model_season_month) + + +def _f_model_yearly_season_fourier(a_x, a_date, params, is_mult=False, **kwargs): + # Infer the time series frequency to calculate the Fourier parameters + + period = dict_fourier['period'] + harmonics = dict_fourier['harmonics'] + + return _f_model_season_fourier(a_date, params, period, harmonics, is_mult) + + +date_origin = pd.datetime(1970, 1, 1) + + +def _f_model_season_fourier(a_date, params, period, harmonics, is_mult=False): + # convert to days since epoch + t = (a_date - date_origin).days.values + i = np.arange(1,harmonics+1) + a_tmp = i.reshape(i.size,1)* t + k = (2.0 * np.pi / period) + y = np.concatenate([np.sin(k*a_tmp), np.cos(k*a_tmp)]) + + # now multiply by the params + y = np.matmul(params, y) + return y + + +def _f_init_params_fourier_n_params(n_params, a_x=None, a_y=None, a_date=None, is_mult=False): + if a_y is None: + params = np.random.uniform(0.001, 1, n_params) + else: + # max difference in time series + diff = a_y.max() - a_y.min() + params = diff * np.random.uniform(0.001, 1, n_params) + return params + + +def _f_init_params_fourier(a_x=None, a_y=None, a_date=None, is_mult=False): + n_params = 2 * dict_fourier['harmonics'] + return _f_init_params_fourier_n_params( + n_params, a_x=a_x, a_y=a_y, a_date=a_date, is_mult=is_mult) + + +def _f_init_bounds_fourier_nparams(n_params, a_x=None, a_y=None, a_date=None): + if a_y is None: + return n_params * [-np.inf], n_params * [np.inf] + if a_y.ndim > 1: + a_y = a_y[:, 0] + + diff = a_y.max() - a_y.min() + return n_params * [-2 * diff], n_params * [2 * diff] + + +def _f_init_bounds_fourier_yearly(a_x=None, a_y=None, a_date=None): + n_params = 2 * dict_fourier['harmonics'] + return _f_init_bounds_fourier_nparams(n_params, a_x, a_y, a_date) + + +model_season_fourier_yearly = ForecastModel( + name='season_fourier_yearly', + n_params=2 * dict_fourier['harmonics'], + f_model=_f_model_yearly_season_fourier, + f_init_params=_f_init_params_fourier, + f_bounds=_f_init_bounds_fourier_yearly) + + +def get_fixed_model(forecast_model, params_fixed, is_mult=False): + if len(params_fixed) != forecast_model.n_params: + err = 'Wrong number of fixed parameters' + raise ValueError(err) + return ForecastModel(forecast_model.name + '_fixed', 0, + f_model=lambda a_x, a_date, params, is_mult=is_mult, **kwargs: + forecast_model.f_model( + a_x=a_x, a_date=a_date, params=params_fixed, is_mult=is_mult)) + + +def get_iqr_thresholds(s_diff, low=0.25, high=0.75): + # Get thresholds based on inter quantile range + q1 = s_diff.quantile(low) + q3 = s_diff.quantile(high) + iqr = q3 - q1 + thr_low = q1 - 1.5 * iqr + thr_hi = q3 + 1.5 * iqr + return thr_low, thr_hi + +def get_model_outliers_withgap(df, window=3): + # TODO: ADD CHECK, TO PREVENT REDUNDANT OPS IN DF WITHOUT GAPS + + df_nogap = df.pipe(model_utils.interpolate_df, include_mask=True) + mask_step, mask_spike = 
get_model_outliers(df_nogap) + + ## TODO: FOR EACH OF MASK STEP, MASK SPIKE, IF IT IS NONE, RETURN NONE + if mask_spike is None and mask_step is None: + return None,None + if mask_spike is not None: + df_nogap['mask_spike'] = mask_spike + if mask_step is not None: + df_nogap['mask_step'] = mask_step + df_nogap['step_in_filled_gap'] = df_nogap.mask_step * df_nogap.is_gap_filled + df_nogap['mask_step_patch'] = df_nogap.step_in_filled_gap.shift(-1).fillna(0) + + df_nogap = df_nogap.loc[~df_nogap.is_gap_filled] + + if mask_step is not None: + df_nogap['mask_step_patch'] = df_nogap.mask_step_patch.shift(1).fillna(0) + df_nogap['mask_step'] = df_nogap.mask_step + df_nogap.mask_step_patch + + logger_info('df 1 - no gap:', df_nogap) + + if mask_step is not None: + mask_step = df_nogap.mask_step.values + if mask_spike is not None: + mask_spike = df_nogap.mask_spike.values + return mask_step, mask_spike + + # todo - clean up, return + + + +# TODO: Add option - estimate_outl_size +# TODO: Add option - sigmoid steps +# TODO: ADD option - gaussian spikes +def get_model_outliers(df, window=3): + """ + + :param df: + :type df: + :param window: + :type window: + :return: + :rtype: + Note: due to the way the thresholds are defined, we require 6+ samples in series to find a spike. + """ + is_mult = False + + dfo = df.copy() # dfo - df for outliers + with_dates = 'date' in df.columns # If df has datetime index, use date logic in steps/spikes + x_col = 'date' if with_dates else 'x' + + if df[x_col].duplicated().any(): + raise ValueError('Input cannot have multiple values per sample') + + # logger_info('debug 0 :', dfo) + + dfo['dif'] = dfo.y.diff() # .fillna(0) + + # TODO: If df has weight column, use only samples with weight=1 for IQR + + thr_low, thr_hi = get_iqr_thresholds(dfo.dif) + # Identify changes of state when diff value exceeds thresholds + dfo['ischange'] = ((dfo.dif < thr_low) | (dfo.dif > thr_hi)).astype(int) + + dfo['ischange_group'] = ( + (dfo.ischange) + .rolling(window, win_type=None, center=True).max() + .fillna(0).astype(int) + ) + + dfo['dif_filt'] = (dfo.dif * dfo.ischange).fillna(0) + dfo['dif_filt_abs'] = dfo.dif_filt.abs() + + dfo['ischange_cumsum'] = dfo.ischange.cumsum() + dfo['change_group'] = dfo.ischange_group.diff().abs().fillna(0).astype(int).cumsum() + + df_mean_gdiff = ( + dfo.loc[dfo.ischange.astype(bool)].groupby('change_group')['dif_filt'].mean() + .rename('mean_group_diff').reset_index() + ) + + df_mean_gdiff_abs = ( + dfo.loc[dfo.ischange.astype(bool)].groupby('change_group')['dif_filt_abs'].mean() + .rename('mean_group_diff_abs').reset_index() + ) + + dfo = dfo.merge(df_mean_gdiff, how='left').merge(df_mean_gdiff_abs, how='left') + dfo.mean_group_diff = dfo.mean_group_diff.fillna(0) + dfo.mean_group_diff_abs = dfo.mean_group_diff_abs.fillna(0) + + dfo['is_step'] = (dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi) + dfo['is_spike'] = (dfo.mean_group_diff_abs - dfo.mean_group_diff) > (thr_hi - thr_low) / 2 + dfo['ischange_cumsum'] = dfo.ischange.cumsum() + + # logger_info('DF_OUTL: ',dfo) + + df_outl = ( + dfo.loc[dfo.ischange.astype(bool)].groupby('change_group') + .apply(lambda x:pd.Series({'outl_start':x.head(1)[x_col].iloc[0],'outl_end':x.tail(1)[x_col].iloc[0]})) + .reset_index() + ) + + if df_outl.empty: # No outliers - nothing to do + return None, None + + df_outl = df_outl.merge(dfo[['change_group', 'is_spike', 'is_step']].drop_duplicates()) + + dfo = dfo.merge(df_outl, how='left') + dfo['outl_start'] = dfo.outl_start.fillna(0).astype(int) + 
dfo['outl_end'] = dfo.outl_end.fillna(0).astype(int) + + dfo = dfo # .reset_index() + + df_spikes = df_outl.loc[df_outl.is_spike] + df_steps = df_outl.loc[df_outl.is_step] + + l_model_outl = [] + l_mask_step = [] + l_mask_spike = [] + + for g in df_spikes.change_group: + s_spike = df_spikes.loc[df_spikes.change_group == g].iloc[0] + if with_dates: + mask_spike_tmp = ~((dfo.date>=pd.to_datetime(s_spike.outl_start)) & + (dfo.date=s_spike.outl_start) & + (dfo.x.values Q3 + 3 * IQR + # Q1 and Q3 are the 25th (1st) and 75th (3rd) quartiles, and + # IQR is the inter-quartile range + q1 = df['diff'].quantile(0.25) + q3 = df['diff'].quantile(0.75) + iqr = q3 - q1 + low_thresh = q1 - 1.5 * iqr + high_thresh = q3 + 1.5 * iqr + + # df['is_change'] = 0 + step_filt = (df['diff'] < low_thresh) | (df['diff'] > high_thresh) + df['is_change'] = step_filt.astype(int) + + if not any(step_filt): + return [], [] + + # df.loc[step_filt, 'is_change'] = 1 + + # Now that we have found the outliers in differences, + # group consecutive steps together + + # get only the diffs that correspond to changes + df['diff'] = df['diff'] * df['is_change'] + df[['diff_sum', 'change_sum']] = df[['diff', 'is_change']].rolling( + window, win_type=None, center=True).sum() + + # we have steps, we may need to aggregate + # We split the array with zeros. + # This means that we treat all nearby changes as one, + # within `window` values + split = np.split(df['change_sum'], + np.where(df['change_sum'] == 0.)[0]) + # get rid of zero only series + split = [i for i in split if i.any()] + + # Now we have a list of series with the changes + changes_list = [] + for s in split: + change_s = df.iloc[s.index] + change_max_occur = change_s[change_s.is_change == 1].index.max() + change_min_occur = change_s[change_s.is_change == 1].index.min() + diff = change_s['diff'].sum() + duration = change_max_occur - change_min_occur + + if low_thresh <= diff <= high_thresh: # Change is a spike + change_type = 'spike' + # we keep the starting point as x + x = change_min_occur + # get the average change for the values of the change that + # are in the changing threshold + diff = change_s.loc[(change_s.is_change == 1) & + ((change_s['diff'] < low_thresh) | + (change_s['diff'] > high_thresh)), 'diff'].abs().mean() + + else: # Change is a step + # here we have a different starting point + x = (change_max_occur + change_min_occur - 1) / 2.0 + change_type = 'step' + + d = {'change_type': change_type, + 'duration': duration, + 'diff': diff, + 'x': x} + changes_list += [d] + + # Sort by absolute difference, in descending order + sorted_changes_list = sorted(changes_list, key=lambda ch: abs(ch['diff']), + reverse=True) + + # Rule of thumb: the maximum number of changes + # is the square root of the time series length + max_max_changes = int(np.floor(np.sqrt(len(a_y)))) + # If we have a max_changes input value, select the ones with higher diff + if (not max_changes) or (max_changes > max_max_changes): + max_changes = max_max_changes + + changes_list = sorted_changes_list[:max_changes] + + steps = [] + spikes = [] + for c in changes_list: + # get models + if c['change_type'] == 'spike': + # spike = create_fixed_spike(c['diff'], x=c['x'], + # duration=c['duration']) + spike = create_fixed_spike_ignored(x=c['x'], + duration=c['duration']) + spikes += [spike] + elif c['change_type'] == 'step': + step = create_fixed_step(diff=c['diff'], x=c['x']) + steps += [step] + else: + raise ValueError('Invalid change type: ' + c['change_type']) + + return steps, spikes + + +def 
create_fixed_step(diff, x): + fixed_params = [x, diff] + return get_fixed_model(model_step, fixed_params) + + +def create_fixed_spike(diff, x, duration): + fixed_params = [diff, x, x + duration] + return get_fixed_model(model_spike, fixed_params) + + +def create_fixed_spike_ignored(x, duration): + fixed_params = [0, x, x + duration] + return get_fixed_model(model_spike, fixed_params, is_mult=True) + + +# Dummy variable models + +def get_model_dummy(name, dummy, **kwargs): + """ + Generate a model based on a dummy variable. + + :param name: + :type name: + :param dummy: + | Can be a function or a list-like. + | If a function, it must be of the form f_dummy(a_x, a_date), and return a numpy array of floats + | with the same length as a_x and values that are either 0 or 1. + | If a list-like of numerics, it will be converted to a f_dummy function as described above, which will + | have values of 1 when a_x has one of the values in the list, and 0 otherwise. + | If a list-like of date-likes, it will be converted to a f_dummy function as described above, which will + | have values of 1 when a_date has one of the values in the list, and 0 otherwise. + :type dummy: function, or list-like of numerics or datetime-likes + :param kwargs: + :type kwargs: + :return: + | A model that returns A when dummy is 1, and 0 (or 1 if is_mult==True) otherwise. + :rtype: ForecastModel + + + """ + return ForecastModel(name, 1, get_f_model_dummy(dummy), **kwargs) + + +def _validate_f_dummy(f_dummy): + # Ensures that behaviour of f_dummy matches specs + # Must return array of floats, same length as a_x, with values either 0. or 1. + def validate_for_dummy(a_dummy): + assert isinstance(a_dummy, np.ndarray) + assert (np.setdiff1d(a_dummy, np.array([0., 1.])).size) == 0 + + # validate_for_dummy(f_dummy(np.arange(0, 10), None)) # Crashes with f_dummy 's that require dates + validate_for_dummy(f_dummy(np.arange(0, 10), pd.date_range('2018-01-01', '2018-01-10'))) + + +def get_f_model_dummy(dummy): + """ + Generate a model function for a dummy variable defined by f_dummy + + :param dummy: + :type dummy: function or list-like of numerics or dates + :return: model function based on dummy variable, to use on a ForecastModel + :rtype: function + """ + + if callable(dummy): # If dummy is a function, use it + f_dummy = dummy + else: + f_dummy = get_f_dummy_from_list(dummy) # If dummy is a list, convert to function + + _validate_f_dummy(f_dummy) + + def f_model_check(a_x, a_date, params, is_mult=False, **kwargs): + # Uses internal f_check to assign 0 or 1 to each sample + # If f_dummy(x)==1, return A + # If f_dummy(x)==0, return 0 (or 1 if is_mult) + [A] = params + mask = f_dummy(a_x, a_date) + if not is_mult: + a_result = A * mask + else: + a_result = (A - 1.) 
* mask + 1 + return a_result + + return f_model_check + + +def get_f_dummy_from_list(list_check): + """ + Generate a f_dummy function that defines a dummy variable, can be used for dummy models + + :param list_check: Input list + :type list_check: list-like of numerics or datetime-likes + :return: f_dummy + :rtype: function + """ + # Generate a f_dummy function that defines a dummy variable, can be used for dummy models + s_check = pd.Series(list_check) + if pd.api.types.is_numeric_dtype(s_check): + list_check_numeric = s_check + + def f_dummy_list_numeric(a_x, a_date): + # return a_x in check_numeric + return np.isin(a_x, list_check_numeric).astype(float) + + return f_dummy_list_numeric + else: + try: + list_check_date = pd.to_datetime(s_check) + + def f_dummy_list_date(a_x, a_date): + # return a_x in check_numeric + return np.isin(a_date, list_check_date).astype(float) + + return f_dummy_list_date + except: + raise ValueError('list_dummy must be a list-like with numeric or date-like values: %s', list_check) + + +model_season_wday_2 = get_model_dummy('season_wday_2', lambda a_x, a_date, **kwargs: (a_date.weekday < 5).astype(float)) + +# Example dummy model - checks if it is Christmas +model_dummy_christmas = get_model_dummy('dummy_christmas', + lambda a_x, a_date, **kwargs: ((a_date.month == 12) & (a_date.day == 25)).astype(float)) + +# Example dummy model - checks if it is first day of month +model_dummy_month_start = get_model_dummy('dummy_month_start', + lambda a_x, a_date, **kwargs: (a_date.day == 1).astype(float)) + + +# Utility functions + +def fix_params_fmodel(forecast_model, l_params_fixed): + """ + Given a forecast model and a list of floats, modify the model so that some of its parameters become fixed + + :param forecast_model: + :type forecast_model: + :param l_params_fixed: List of floats with same length as number of parameters in model. For each element, a + non-null value means that the parameter in that position is fixed to that value. A null value means that + the parameter in that position is not fixed. 
+ :type l_params_fixed: list + :return: A forecast model with a number of parameters equal to the number of null values in l_params_fixed, + with f_model modified so that some of its parameters gain fixed values equal to the non-null values in l_params + :rtype: + """ + assert len(l_params_fixed) == forecast_model.n_params + + l_params_fixed = np.array(l_params_fixed) + + a_null = np.isnan(l_params_fixed) + i_null = np.nonzero(a_null) + + name = '{}_fixed_{}'.format(forecast_model.name, str(l_params_fixed).replace('nan', ':') + ) + n_params = len(i_null[0]) + + def f_model_fixed(a_x, a_date, params, is_mult=False, **kwargs): + params_long = l_params_fixed + params_long[i_null] = params + return forecast_model.f_model(a_x, a_date, params_long, is_mult) + + def f_init_params_fixed(a_x=None, a_y=None, a_date=None, is_mult=False): + # return params short + params_init = forecast_model.f_init_params(a_x, a_y, a_date, is_mult) + params_init_short = np.array(params_init)[i_null] + return params_init_short + + def f_bounds_fixed(a_x=None, a_y=None, a_date=None): + # return f_bounds short + bounds_min, bounds_max = forecast_model.f_bounds(a_x, a_y, a_date) + bounds_min_short = np.array(bounds_min)[i_null] + bounds_max_short = np.array(bounds_max)[i_null] + return bounds_min_short, bounds_max_short + + model_result = ForecastModel(name, n_params, f_model_fixed, f_init_params_fixed, f_bounds_fixed) + return model_result + + +def simplify_model(f_model, a_x=None, a_y=None, a_date=None): + """ + Check a model's bounds, and update model to make parameters fixed if their min and max bounds are equal + + :param f_model: + :type f_model: + :param a_x: + :type a_x: + :param a_y: + :type a_y: + :param a_date: + :type a_date: + :return: + :rtype: + """ + bounds_min, bounds_max = f_model.f_bounds(a_x, a_y, a_date) + bounds_diff = np.array(bounds_max) - np.array(bounds_min) + i_diff_zero = np.nonzero(bounds_diff == 0) + # For any parameter, if bounds_min == bounds_max, that parameter becomes fixed + + if i_diff_zero[0].size == 0: + return f_model + else: # We make parameters fixed if their min and max bounds are equal + params_fixed = np.full(f_model.n_params, np.NaN) + params_fixed[i_diff_zero, ] = bounds_max[i_diff_zero, ] + f_model = fix_params_fmodel(f_model, params_fixed) + logger.info('Some min and max bounds are equal - generating fixed model: %s', f_model.name) + return f_model + + +def validate_initial_guess(initial_guess, bounds): + initial_guess = np.array(initial_guess) + bounds_min, bounds_max = bounds + return np.all((initial_guess >= bounds_min) & (initial_guess <= bounds_max)) + + +def get_l_model_auto_season(a_date, min_periods=1.5, season_add_mult='add', + l_season_yearly=None, l_season_weekly=None): + """ + Generates a list of candidate seasonality models for an series of timestamps + + :param a_date: + :type a_date: + :param min_periods: + :type min_periods: + :param is_mult: + :type is_mult: + :return: + :rtype: + """ + s_date = pd.Series(a_date).sort_values().drop_duplicates() + min_date_delta = s_date.diff().min() + max_date_delta = s_date.max() - s_date.min() + + if pd.isna(min_date_delta) or pd.isna(max_date_delta): + return [model_null] + + use_season_yearly = ( + (max_date_delta > pd.Timedelta(min_periods * 365, unit='d')) & # Need more than a full year + (min_date_delta <= pd.Timedelta(92, unit='d')) # Need at least quarterly samples + ) + + use_season_weekly = ( + (max_date_delta > pd.Timedelta(min_periods * 7, unit='d')) & # Need more than a full week + (min_date_delta <= 
pd.Timedelta(1, unit='d')) # Need at least daily samples + ) + + l_season_yearly_default = [ + # model_season_month, + model_season_fourier_yearly, + model_null] if l_season_yearly is None else l_season_yearly + l_season_weekly_default = [model_season_wday, model_null] if l_season_weekly is None else l_season_weekly + + if use_season_weekly: + l_season_weekly = l_season_weekly_default + else: + l_season_weekly = [model_null] + + if use_season_yearly: + l_season_yearly = l_season_yearly_default + # TODO: add season_yearly_fourier + # TODO: add holiday list + else: + l_season_yearly = [model_null] + + l_result = [model_null] + for s_w, s_y in itertools.product(l_season_weekly, l_season_yearly): + + model_season_add = s_w + s_y + model_season_mult = s_w * s_y + + if season_add_mult in ['add'] and model_season_add != model_null: + l_result += [model_season_add] + if season_add_mult in ['mult'] and model_season_mult != model_null and \ + model_season_mult not in l_result: + l_result += [model_season_mult] + + return l_result + + """ + todo: RENAME SEASON MODELS + - season_yearly_month, season_yearly_fourier, season_yearly_quarter + - season_weekly_wday, season_weekly_wkd + + """ diff --git a/anticipy/forecast_plot.py b/anticipy/forecast_plot.py new file mode 100644 index 0000000..de61920 --- /dev/null +++ b/anticipy/forecast_plot.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +# +# License: This module is released under the terms of the LICENSE file +# contained within this applications INSTALL directory + +""" + __high_level_module_description_here__ +""" + +# -- Coding Conventions +# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide +# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings + +# -- Public Imports +from tempfile import NamedTemporaryFile +import os +import matplotlib.pyplot as plt +import logging +import numpy as np + +# -- Globals +logger = logging.getLogger(__name__) + +# ---- R Globals + +r_utils = ( + """ + require(scales) + require(stringr) + + get_label_f = function(div=1, mult=1, curr='',unit='', digits=1){ # ... includes digits parameter, passed to string format() + # Returns formatting functions for scale labels + function(x, ...) { + paste0(curr, format(x*mult/div, digits=digits, ..., big.mark = ",", scientific = FALSE, trim = TRUE),unit) %>% + str_replace(paste0(curr,'-'),paste0('-',curr)) + } + } + # Scale for thousands of units + s_y_k = scale_y_continuous(labels=get_label_f(div=1000, unit='k', digits=1)) + # Scale for millions of units + s_y_m = scale_y_continuous(labels=get_label_f(div=10^6, unit='M', digits=4)) + """) + + +# -- Functions + +# ----- Utility functions +def logger_info(msg, data): + # Convenience function for easier log typing + logger.info(msg + '\n%s', data) + + +def df_string_to_unicode(df): + # In a dataframe, convert any string columns to unicode strings + df = df.copy() + columns_str = df.dtypes == basestring + if not columns_str.any(): + return df + for col in df.columns[columns_str]: + df[col] = df[col].astype('unicode') + return df + + +def to_feather(df, file_path): + # Save dataframe as feather file. Formats strings on unicode, for compatibility with R. Drops index. 
+ df.reset_index(drop=True).pipe(df_string_to_unicode).to_feather(file_path) + + +def pix_to_in(width_px=None, height_px=None, dpi=300): + # Utility function to use pixel dimensions rather than ggplot's physical dims + dpi = float(dpi) + + width_in = width_px / dpi if width_px is not None else np.NaN + height_in = height_px / dpi if height_px is not None else np.NaN + # print width_in, height_in + return width_in, height_in + + +def has_pi (df_fcast): + return 'q5' in df_fcast.columns + +# ---- Plotting functions + + +def _plot_forecast_create(df_fcast, width=None, height=None, title=None, dpi=70, col_name_y='y', + col_name_source='source', col_name_date='date', col_name_model='model', scale=None): + """ + Creates ggplot object from forecast dataframe + + :param df_fcast: + | Forecast Dataframe with the following columns: + | - date (timestamp) + | - model (str) : ID for the forecast model + | - y (float) : Value of the time series in that sample + | - is_actuals (bool) : True for actuals samples, False for forecasted samples + :type df_fcast: pandas.DataFrame + :param title: Plot title + :type title: str + :param scale: Scale of y axis: If 'k', show thousands, and if 'M', show millions + :type scale: str + :return: The plot + :rtype: matplotlib plot instance + """ + # Default palette from ggplot + act_col = '#00BFC4' + for_col = '#F8766D' + plt.style.use('ggplot') + figsize = (width / dpi, height / dpi) + + # Clean actuals - weights do not get plotted + df_fcast = df_fcast.loc[df_fcast.model != 'weight'] + + # create the DatetimeIndex + df_fcast = df_fcast.set_index('date') + + if 'source' in df_fcast.columns: + just_one = False + sources = df_fcast.loc[df_fcast['is_actuals'], 'source'].unique() + num_plots = len(sources) + nrows = int(np.ceil(np.sqrt(num_plots))) + ncols = int(np.ceil(1. 
* num_plots / nrows)) + else: + # Only one set of actuals and forecast needed + just_one = True + sources = ['y'] + nrows = 1 + ncols = 1 + + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=dpi, squeeze=False) + fig.canvas.set_window_title(title) + + x = 0 + y = 0 + for src in sources: + ax = axes[x, y] + + # Filter the specific source is subplots + if just_one: + source_filt = True + else: + source_filt = df_fcast['source'] == src + + actuals, = ax.plot(df_fcast.loc[source_filt & df_fcast['is_actuals'], :].index, + df_fcast.loc[source_filt & df_fcast['is_actuals'], 'y'], + color=act_col, marker='o', linestyle='None', label='Actuals') + forecast, = ax.plot(df_fcast.loc[source_filt & ~df_fcast['is_actuals'], :].index, + df_fcast.loc[source_filt & ~df_fcast['is_actuals'], 'y'], + color=for_col, marker='None', linestyle='solid', label='Forecast') + + # Fill area between 5th and 95th prediction interval + if ('q5' in df_fcast.columns) and ('q95' in df_fcast.columns): + where_to_fill = (source_filt & + (~df_fcast['is_actuals']) & + (~df_fcast['q5'].isnull()) & + (~df_fcast['q95'].isnull())) + ax.fill_between(df_fcast.index, df_fcast['q5'], df_fcast['q95'], + where=where_to_fill, + facecolor=for_col, alpha=0.2) + + if ('q20' in df_fcast.columns) and ('q80' in df_fcast.columns): + # Fill area between 20th and 80th prediction interval + where_to_fill_2 = (source_filt & + (~df_fcast['is_actuals']) & + (~df_fcast['q20'].isnull()) & + (~df_fcast['q80'].isnull())) + ax.fill_between(df_fcast.index, df_fcast['q20'], df_fcast['q80'], + where=where_to_fill_2, + facecolor=for_col, alpha=0.2) + + if not just_one: + # Set the title of each subplot as per source name + ax.set_title(src) + + ax.legend(handles=[actuals, forecast], + labels=['Actuals', 'Forecast'], loc='upper left') + + y += 1 + if y >= ncols: + # New row + y = 0 + x += 1 + + # Now make the rest of the graphs invisible + while x < nrows: + while y < ncols: + axes[x, y].set_visible(False) + y += 1 + # New row + y = 0 + x += 1 + + return plt.Figure + + +def plot_forecast_save(df_fcast, file_path, width=None, height=None, title=None, dpi=70, col_name_y='y', + col_name_source='source', col_name_date='date', col_name_model='model', + scale=None, device='png', + transparent_bg=False): + """ + Generates matplotlib plot and saves as file + + :param df_fcast: + | Forecast Dataframe with the following columns: + | - date (timestamp) + | - model (str) : ID for the forecast model + | - y (float) : Value of the time series in that sample + | - is_actuals (bool) : True for actuals samples, False for forecasted samples + :type df_fcast: pandas.DataFrame + :param file_path: File path for output + :type file_path: str + :param width: Image width, in pixels + :type width: int + :param height: Image height, in pixels + :type height: int + :param title: Plot title + :type title: str + :param dpi: Image dpi + :type dpi: Image dpi + :param device: 'png' or 'pdf' + :type device: str + """ + + fig = _plot_forecast_create(df_fcast, width, height, title, dpi, col_name_y, col_name_source, + col_name_date, col_name_model, scale) + + dirname, fname = os.path.split(file_path) + if not os.path.exists(dirname): + logger.error('Path missing {}'.format(file_path)) + os.makedirs(dirname) + plt.savefig(file_path, dpi=dpi) + + +def plot_forecast(df_fcast, width=None, height=None, title=None, dpi=70, scale=None, device='png', + col_name_y='y', col_name_source='source', col_name_date='date', col_name_model='model', + transparent_bg=False): + """ + Generates 
plot and shows in an ipython notebook + + :param df_fcast: + | Forecast Dataframe with the following columns: + | - date (timestamp) + | - model (str) : ID for the forecast model + | - y (float) : Value of the time series in that sample + | - is_actuals (bool) : True for actuals samples, False for forecasted samples + :type df_fcast: pandas.DataFrame + :param width: Image width, in pixels + :type width: int + :param height: Image height, in pixels + :type height: int + :param title: Plot title + :type title: str + :param dpi: Image dpi + :type dpi: Image dpi + :return: Ipython image, to display in a notebook + :rtype: Ipython.display.Image + """ + try: + from IPython.display import Image + except ImportError: + logger.info('IPython not available, skipping...') + return None + + file_plot = NamedTemporaryFile() + plot_forecast_save(df_fcast, file_plot.name, width, height, title, dpi, scale, device, + col_name_y, col_name_source, col_name_date, col_name_model, transparent_bg) + return Image(filename=file_plot.name, format='png') diff --git a/anticipy/model_utils.py b/anticipy/model_utils.py new file mode 100644 index 0000000..90481cf --- /dev/null +++ b/anticipy/model_utils.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +# +# License: This module is released under the terms of the LICENSE file +# contained within this applications INSTALL directory + +""" +Utility functions for model generation +""" + +# -- Coding Conventions +# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide +# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings + +# -- Public Imports +import logging +import math +import numpy as np +import pandas as pd + +# -- Private Imports + +# -- Globals + + +logger = logging.getLogger(__name__) + +dict_wday_name = { + 0: 'W-MON', + 1: 'W-TUE', + 2: 'W-WED', + 3: 'W-THU', + 4: 'W-FRI', + 5: 'W-SAT', + 6: 'W-SUN', +} + + +# -- Exception classes + +# -- Functions +def logger_info(msg, data): + # Convenience function for easier log typing + logger.info(msg + '\n%s', data) + + +def array_transpose(a): + """ + Transpose a 1-D numpy array + + :param a: An array with shape (n,) + :type a: numpy.Array + :return: The original array, with shape (n,1) + :rtype: numpy.Array + """ + return a[np.newaxis, :].T + + +# TODO: rework to support model composition +def model_requires_scaling(model): + """ + Given a :py:class:`nsa.forecast.forecast_models.ForecastModel` return True if the function requires + scaling a_x + + :param model: A get_model_ function from :py:mod:`nsa.forecast.model.periodic_models` or + :py:mod:`nsa.forecast.model.aperiodic_models` + :type model: function + :return: True if function is logistic or sigmoidal + :rtype: bool + """ + requires_scaling = model is not None and model.name in [ + 'logistic', + 'sigmoid' + ] + return requires_scaling + + +def apply_a_x_scaling(a_x, model=None, scaling_factor=100.0): + """ + Modify a_x for forecast_models that require it + + :param a_x: x axis of time series + :type a_x: numpy array + :param model: a :py:class:`nsa.forecast.forecast_models.ForecastModel` + :type model: function or None + :param scaling_factor: Value used for scaling t_values for logistic models + :type scaling_factor: float + :return: a_x with scaling applied, if required + :rtype: numpy array + """ + if model_requires_scaling(model): # todo: check that this is still useful + a_x = a_x / scaling_factor + return a_x + + +dict_freq_units_per_year = {'A': 1.0, 'Y': 1.0, 'D': 365.0, 'W': 52.0, 'M': 12, 'Q': 4, 'H': 24 * 365.0} + + +def 
get_s_x_extrapolate(date_start_actuals, date_end_actuals, model=None, freq='W', extrapolate_years=2.5, + shifted_origin=0, scaling_factor=100.0, x_start_actuals=0.): + """ + Return t_values series with DateTimeIndex, covering the date range for the actuals, plus a forecast period. + + + :param date_start_actuals: date or numeric index for first actuals sample + :type date_start_actuals: str, datetime, int or float + :param date_end_actuals: date or numeric index for last actuals sample + :type date_end_actuals: str, datetime, int or float + :param extrapolate_years: + :type extrapolate_years: float + :param model: + :type model: function + :param freq: Time unit between samples. Supported units are 'W' for weekly samples, or 'D' for daily samples. + (untested) Any date unit or time unit accepted by numpy should also work, see + https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.datetime.html#arrays-dtypes-dateunits + :type freq: str or int + :param shifted_origin: Offset to apply to a_x + :type shifted_origin: int + :param scaling_factor: Value used for scaling a_x for certain model functions + :type scaling_factor: float + :return: Series of floats with DateTimeIndex. To be used as (a_date, a_x) input for a model function. + :rtype: pandas.Series + + The returned series covers the actuals time domain plus a forecast period lasting extrapolate_years, in years. + The number of additional samples for the forecast period is time_resolution * extrapolate_years, rounded down + """ + if isinstance(date_start_actuals, str) or isinstance(date_start_actuals, pd.datetime): # Use dates if available + date_start_actuals = pd.to_datetime(date_start_actuals) + date_end_actuals = pd.to_datetime(date_end_actuals) + + if freq is None: # Default frequency + freq='W' + + freq_short = freq[0:1] # Changes e.g. W-MON to W + # freq_units_per_year = 52.0 if freq_short=='W' else 365.0 # Todo: change to dict to support more frequencies + freq_units_per_year = dict_freq_units_per_year.get(freq_short, 365.0) + extrapolate_units = extrapolate_years*freq_units_per_year + date_end_forecast = date_end_actuals+pd.to_timedelta(extrapolate_units, unit=freq_short) + + index = pd.date_range(date_start_actuals, date_end_forecast, freq=freq, name='date') + else: # Otherwise, use numeric index - we extrapolate future samples equal to 100*extrapolate_years + index = pd.Index(np.arange(date_start_actuals, date_end_actuals+100*extrapolate_years)) + + s_x = pd.Series(index=index, data=np.arange(x_start_actuals, x_start_actuals+index.size))+shifted_origin + if model_requires_scaling(model): + s_x = s_x / scaling_factor + + return s_x + + +# Forecast Selection Functions + +def get_aic_c(fit_error, n, n_params): + """ + This function implements the corrected Akaike Information Criterion (AICc), taking as input + a given fit error and data/model degrees of freedom. We assume that the residuals of the candidate model + are distributed according to independent identical normal distributions with zero mean. Hence, we can use + define the AICc as + + .. math:: + + AICc = AIC + \\frac{2k(k+1)}{n-k-1} = 2k + n \\log\\left(\\frac{E}{n}\\right) + \\frac{2k(k+1)}{n-k-1}, + + where :math:`k` and :math:`n` denotes the model and data degrees of freedom respectively, and :math:`E` + denotes the residual error of the fit. 
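+
+    For example, with :math:`n = 100` samples, :math:`k = 3` parameters and a residual error
+    :math:`E = 10`, this gives roughly :math:`100\log(0.1) + 6 + 24/96 \approx -224`; lower
+    AICc values indicate a better trade-off between fit quality and model complexity.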
+ + :param fit_error: Residual error of the fit + :type fit_error: float + :param n: Data degrees of freedom + :type n: int + :param n_params: Model degrees of freedom + :type n_params: int + :return: Corrected Akaike Information Criterion (AICc) + :rtype: float + + Note: + + - see AIC in `Wikipedia article on the AIC `_. + + """ + # First, deal with corner cases that can blow things up with division by zero + if (n <= n_params + 1) or (n == 0): + aux = n - n_params - 1 + raise ValueError( + 'ERROR: Time series too short for AIC_C: (n = ' + str(n) + ', n - n_params - 1 = ' + str(aux) + ')') + elif fit_error == 0.0: + if n_params == 1: + aicc = -float("inf") + else: + # This can lead to suboptimal model selection when we have multiple perfect fits - we use a patch instead + # aicc = -float("inf") + fit_error = 10 ** -320 + aicc = n * math.log(fit_error / n) + 2 * n_params + (2 * n_params * (n_params + 1) / (n - n_params - 1)) + + else: + # Actual calculation of the AICc + aicc = n * math.log(fit_error / n) + 2 * n_params + (2 * n_params * (n_params + 1) / (n - n_params - 1)) + + # logger.info('DEBUG: getting aicc, fit_error: %s, n: %s, n_params: %s, aicc: %s', fit_error, n, n_params, aicc) + return aicc + + +def get_s_aic_c_best_result_key(s_aic_c): + # Required because aic_c can be -inf, that value is not compatible with pd.Series.argmin() + if s_aic_c.empty or s_aic_c.isnull().all(): + return None + if (s_aic_c.values == -np.inf).any(): + (key_best_result,) = (s_aic_c == -np.inf).nonzero() + key_best_result = s_aic_c.index[key_best_result.min()] + else: + key_best_result = s_aic_c.argmin() + return key_best_result + + +def detect_freq(a_date): + if isinstance(a_date, pd.DataFrame): + if 'date' not in a_date.columns: + return None + else: + a_date = a_date.date + s_date = pd.Series(a_date).sort_values().drop_duplicates() + min_date_delta = s_date.diff().min() + if pd.isnull(min_date_delta): + return None + elif min_date_delta == pd.Timedelta(1, unit='h'): + return 'H' + elif min_date_delta == pd.Timedelta(7, unit='D'): + # Weekly seasonality - need to determine day of week + min_date_wday = s_date.min().weekday() + return dict_wday_name.get(min_date_wday, 'W') + elif min_date_delta >= pd.Timedelta(28, unit='d') and \ + min_date_delta <= pd.Timedelta(31, unit='d'): + # MS is month start, M is month end. We use MS if all dates match first of month + if s_date.dt.day.max() == 1: + return 'MS' + else: + return 'M' + elif min_date_delta >= pd.Timedelta(89, unit='d') and \ + min_date_delta <= pd.Timedelta(92, unit='d'): + return 'Q' + elif min_date_delta >= pd.Timedelta(365, unit='d') and \ + min_date_delta <= pd.Timedelta(366, unit='d'): + # YS is month start, Y is month end. 
+        if s_date.dt.day.max() == 1 and s_date.dt.month.max() == 1:
+            return 'YS'
+        else:
+            return 'Y'
+    elif min_date_delta >= pd.Timedelta(23, unit='h'):
+        # and min_date_delta <= pd.Timedelta(1, unit='d')
+        return 'D'
+    else:
+        return None
+
+
+def interpolate_df(df, include_mask=False):
+    # In a dataframe with date gaps, replace gaps with interpolation
+    if 'date' not in df.columns:  # interpolate by x column
+        if df.x.diff().nunique() <= 1:
+            return df
+        else:
+            df_result = (
+                df.set_index('x')
+                .reindex(pd.RangeIndex(df.x.min(), df.x.max()+1, name='x'))
+                .interpolate()
+                .reset_index()
+            )
+
+    else:  # df has date column - interpolate by date
+        s_date_diff = df.date.diff()
+        if s_date_diff.pipe(pd.isnull).all():
+            s_date_diff_first = None
+        else:
+            s_date_diff_first = s_date_diff.loc[s_date_diff.first_valid_index()]
+        freq = detect_freq(df)
+        # If the spacing between samples is constant, no interpolation is required.
+        # Exception: in sparse series with date gaps, we can randomly get gaps that are constant but
+        # don't match any real period, e.g. 8 days
+
+        if s_date_diff.nunique() <= 1 and not (freq == 'D' and s_date_diff_first > pd.to_timedelta(1, 'day')):
+            # TODO: Add additional check for e.g. 2-sample series with 8-day gap
+            return df
+        df_result = (
+            df.set_index('date')
+            .asfreq(freq)
+            .interpolate()
+            .reset_index()
+        )
+    if 'x' in df.columns:
+        df_result['x'] = df_result['x'].astype(df.x.dtype)
+    if include_mask:
+        df_result['is_gap_filled'] = ~df_result.x.isin(df.x)
+    return df_result
diff --git a/anticipy/utils_test.py b/anticipy/utils_test.py
new file mode 100644
index 0000000..49c21bf
--- /dev/null
+++ b/anticipy/utils_test.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+#
+# License: This module is released under the terms of the LICENSE file
+# contained within this application's INSTALL directory
+
+"""
+    Class and functions to test pandas dataframes and series
+"""
+
+# -- Coding Conventions
+# http://www.python.org/dev/peps/pep-0008/ - Use the Python style guide
+# http://sphinx.pocoo.org/rest.html - Use Restructured Text for docstrings
+
+# -- Public Imports
+import unittest
+import numpy as np
+import pandas as pd
+import pandas.util.testing as pdt
+import logging
+
+# -- Globals
+logger = logging.getLogger(__name__)
+
+
+# -- Exception classes
+
+# -- Functions
+def logger_info(msg, data):
+    # Convenience function for easier log typing
+    logger.info(msg + '\n%s', data)
+
+
+def _is_dtype_categorical(x):
+    if type(x) is pd.DataFrame:
+        # Slightly faster than x.dtypes == 'category'
+        return x.dtypes.apply(lambda x: x.name == 'category')
+    else:
+        # Used because x.dtype == 'category' doesn't always work
+        return x.dtype.name == 'category'
+
+
+# -- Classes
+class PandasTest(unittest.TestCase):
+
+    def assert_frame_equal(self, left, right, ignore_index=False, compare_as_strings=False,
+                           ignore_column_order=False, **kwargs):
+        """
+        Checks that 2 dataframes are equal
+
+        :param left: first dataframe to compare
+        :type left: pandas.DataFrame
+        :param right: second dataframe to compare
+        :type right: pandas.DataFrame
+        :param ignore_index: if True, reset both indexes before comparing
+        :type ignore_index: bool
+        :param compare_as_strings: if True, cast both dataframes to str before comparing
+        :type compare_as_strings: bool
+        :param ignore_column_order: if True, reorder the columns of right to match left before comparing
+        :type ignore_column_order: bool
+        :param kwargs: additional arguments, passed to pandas.util.testing.assert_frame_equal
+
+        """
+        l = left
+        r = right
+        if ignore_index:
+            l = l.reset_index(drop=True)
+            r = r.reset_index(drop=True)
+        if compare_as_strings:
+            l = l.astype(str)
+            r = r.astype(str)
+        if ignore_column_order:
+            r = r.pdu_reorder(l.columns)
+        pdt.assert_frame_equal(l, r, **kwargs)
+
+    def assert_frame_not_equal(self, left, right, ignore_index=False, **kwargs):
+        if ignore_index:
+            with
self.assertRaises(AssertionError): + pdt.assert_frame_equal(left.reset_index(drop=True), right.reset_index(drop=True), **kwargs) + else: + with self.assertRaises(AssertionError): + pdt.assert_frame_equal(left, right, **kwargs) + + def assert_series_equal(self, left, right, ignore_index=False, compare_as_strings=False, ignore_name=True, + **kwargs): + """ + Checks that 2 series are equal + + :param left: + :type left: + :param right: + :type right: + :param ignore_index: + :type ignore_index: + :param compare_as_strings: + :type compare_as_strings: + :param kwargs: + :type kwargs: + """ + l = left + r = right + pdt._check_isinstance(l, r, pd.Series) + if ignore_index: + l = l.reset_index(drop=True) + r = r.reset_index(drop=True) + if compare_as_strings: + l = l.astype(str) + r = r.astype(str) + if ignore_name: + l = l.rename(None) + r = r.rename(None) + + if _is_dtype_categorical(l) or _is_dtype_categorical(r): + self.assertTrue(_is_dtype_categorical(l)) + self.assertTrue(_is_dtype_categorical(r)) + self.assertTrue(r.equals(l)) + self.assertEqual(l.cat.ordered, r.cat.ordered) + else: + pdt.assert_series_equal(l, r, **kwargs) + + def assert_array_equal(self, left, right): + np.testing.assert_array_equal(left, right) + +# -- Main diff --git a/setup.py b/setup.py index 0492f9e..07a52aa 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -__version__ = "0.0.1" +__version__ = "0.0.2" # -- Edit Start zip_safe = False @@ -8,13 +8,13 @@ modules = [] dependencies = [ - 'pandas>=0.20.3', + 'matplotlib>=2.2.3', 'numpy>=1.13.3', + 'pandas>=0.20.3', 'scipy>=1.0.0', ] extras_require={ -'r':['rpy2>=2.8.3'], } dependency_links = [ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..f5c41f0 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +__all__=[] \ No newline at end of file diff --git a/tests/data/candy_production.csv b/tests/data/candy_production.csv new file mode 100755 index 0000000..015b3f2 --- /dev/null +++ b/tests/data/candy_production.csv @@ -0,0 +1,549 @@ +observation_date,IPG3113N +1972-01-01,85.6945 +1972-02-01,71.8200 +1972-03-01,66.0229 +1972-04-01,64.5645 +1972-05-01,65.0100 +1972-06-01,67.6467 +1972-07-01,69.0429 +1972-08-01,70.8370 +1972-09-01,75.0462 +1972-10-01,106.9289 +1972-11-01,105.5962 +1972-12-01,105.9673 +1973-01-01,91.2997 +1973-02-01,77.2700 +1973-03-01,69.6110 +1973-04-01,70.2986 +1973-05-01,71.6822 +1973-06-01,74.8635 +1973-07-01,72.0464 +1973-08-01,73.1748 +1973-09-01,80.5915 +1973-10-01,102.9200 +1973-11-01,109.2524 +1973-12-01,105.2210 +1974-01-01,88.6985 +1974-02-01,83.6098 +1974-03-01,77.2300 +1974-04-01,67.3209 +1974-05-01,74.6196 +1974-06-01,79.5858 +1974-07-01,66.0568 +1974-08-01,71.1864 +1974-09-01,70.1750 +1974-10-01,99.2212 +1974-11-01,101.1201 +1974-12-01,86.8930 +1975-01-01,67.0117 +1975-02-01,52.6964 +1975-03-01,50.6689 +1975-04-01,59.7613 +1975-05-01,60.8277 +1975-06-01,63.3629 +1975-07-01,62.3089 +1975-08-01,66.9021 +1975-09-01,66.3200 +1975-10-01,96.3411 +1975-11-01,105.6285 +1975-12-01,102.1819 +1976-01-01,87.9578 +1976-02-01,75.1878 +1976-03-01,62.0101 +1976-04-01,64.4758 +1976-05-01,70.5454 +1976-06-01,68.2086 +1976-07-01,69.3122 +1976-08-01,71.5922 +1976-09-01,76.9073 +1976-10-01,107.9049 +1976-11-01,111.6584 +1976-12-01,113.9655 +1977-01-01,97.3515 +1977-02-01,90.0083 +1977-03-01,77.2871 +1977-04-01,76.0459 +1977-05-01,77.9316 +1977-06-01,78.3077 +1977-07-01,75.8701 +1977-08-01,78.1822 +1977-09-01,84.2727 +1977-10-01,109.2254 +1977-11-01,106.1656 +1977-12-01,113.0575 
+1978-01-01,90.1141 +1978-02-01,80.4678 +1978-03-01,76.4640 +1978-04-01,77.4211 +1978-05-01,76.7081 +1978-06-01,78.1769 +1978-07-01,72.4653 +1978-08-01,75.9054 +1978-09-01,82.7320 +1978-10-01,105.0435 +1978-11-01,111.6915 +1978-12-01,114.0821 +1979-01-01,98.6382 +1979-02-01,84.7727 +1979-03-01,81.0653 +1979-04-01,77.1607 +1979-05-01,78.3780 +1979-06-01,81.0958 +1979-07-01,74.7939 +1979-08-01,77.1113 +1979-09-01,80.8078 +1979-10-01,101.0970 +1979-11-01,106.7263 +1979-12-01,105.6220 +1980-01-01,86.9268 +1980-02-01,84.4365 +1980-03-01,74.4834 +1980-04-01,65.5610 +1980-05-01,74.3631 +1980-06-01,76.9925 +1980-07-01,71.0376 +1980-08-01,77.2616 +1980-09-01,77.9510 +1980-10-01,100.8283 +1980-11-01,106.7109 +1980-12-01,107.0469 +1981-01-01,96.3481 +1981-02-01,90.4918 +1981-03-01,78.0943 +1981-04-01,78.0284 +1981-05-01,83.3531 +1981-06-01,83.0404 +1981-07-01,79.2798 +1981-08-01,81.7679 +1981-09-01,83.2954 +1981-10-01,118.4981 +1981-11-01,116.9605 +1981-12-01,113.2558 +1982-01-01,95.9863 +1982-02-01,92.9899 +1982-03-01,83.0765 +1982-04-01,73.5603 +1982-05-01,76.4383 +1982-06-01,78.5492 +1982-07-01,76.3145 +1982-08-01,77.7653 +1982-09-01,81.3017 +1982-10-01,114.1349 +1982-11-01,114.9389 +1982-12-01,115.1824 +1983-01-01,95.1877 +1983-02-01,87.1973 +1983-03-01,77.9717 +1983-04-01,73.7339 +1983-05-01,75.5696 +1983-06-01,74.7701 +1983-07-01,76.3340 +1983-08-01,79.5580 +1983-09-01,82.8953 +1983-10-01,110.4480 +1983-11-01,106.5100 +1983-12-01,103.9983 +1984-01-01,93.8437 +1984-02-01,86.3220 +1984-03-01,78.9029 +1984-04-01,75.6699 +1984-05-01,77.8830 +1984-06-01,77.6690 +1984-07-01,76.9080 +1984-08-01,81.2320 +1984-09-01,85.8844 +1984-10-01,112.1683 +1984-11-01,115.5118 +1984-12-01,112.8158 +1985-01-01,97.6849 +1985-02-01,87.1184 +1985-03-01,79.1429 +1985-04-01,76.2069 +1985-05-01,77.3304 +1985-06-01,75.8357 +1985-07-01,75.1953 +1985-08-01,79.9166 +1985-09-01,89.5288 +1985-10-01,112.2728 +1985-11-01,113.6916 +1985-12-01,117.1114 +1986-01-01,97.3994 +1986-02-01,93.6471 +1986-03-01,78.8262 +1986-04-01,73.6548 +1986-05-01,76.5236 +1986-06-01,76.7767 +1986-07-01,73.4034 +1986-08-01,79.5478 +1986-09-01,88.4485 +1986-10-01,115.9014 +1986-11-01,119.4066 +1986-12-01,115.4294 +1987-01-01,97.1736 +1987-02-01,94.2793 +1987-03-01,83.6225 +1987-04-01,77.3408 +1987-05-01,78.0336 +1987-06-01,79.1708 +1987-07-01,76.1298 +1987-08-01,83.5260 +1987-09-01,90.7704 +1987-10-01,121.6259 +1987-11-01,124.8565 +1987-12-01,122.6595 +1988-01-01,95.8055 +1988-02-01,95.3010 +1988-03-01,89.8740 +1988-04-01,80.8266 +1988-05-01,82.4593 +1988-06-01,86.7724 +1988-07-01,90.7579 +1988-08-01,98.0626 +1988-09-01,102.5171 +1988-10-01,125.7369 +1988-11-01,123.4990 +1988-12-01,122.4540 +1989-01-01,102.9508 +1989-02-01,102.3499 +1989-03-01,93.4219 +1989-04-01,88.7382 +1989-05-01,87.9183 +1989-06-01,90.5658 +1989-07-01,89.7340 +1989-08-01,96.5697 +1989-09-01,101.0261 +1989-10-01,120.0367 +1989-11-01,123.3104 +1989-12-01,125.9960 +1990-01-01,99.9894 +1990-02-01,101.2116 +1990-03-01,94.8477 +1990-04-01,88.4239 +1990-05-01,88.6775 +1990-06-01,92.7610 +1990-07-01,96.9885 +1990-08-01,102.3169 +1990-09-01,108.6388 +1990-10-01,124.4571 +1990-11-01,133.2020 +1990-12-01,134.4426 +1991-01-01,107.4831 +1991-02-01,111.4080 +1991-03-01,104.8112 +1991-04-01,96.0485 +1991-05-01,94.9222 +1991-06-01,102.6901 +1991-07-01,100.1583 +1991-08-01,109.7879 +1991-09-01,111.1361 +1991-10-01,124.0982 +1991-11-01,129.3138 +1991-12-01,124.9696 +1992-01-01,104.3101 +1992-02-01,102.7870 +1992-03-01,94.9205 +1992-04-01,92.0467 +1992-05-01,89.7304 +1992-06-01,92.8576 
+1992-07-01,92.1938 +1992-08-01,96.2302 +1992-09-01,104.1677 +1992-10-01,118.9880 +1992-11-01,122.1755 +1992-12-01,121.4803 +1993-01-01,105.0701 +1993-02-01,102.2842 +1993-03-01,94.8146 +1993-04-01,89.6044 +1993-05-01,88.4397 +1993-06-01,94.8144 +1993-07-01,95.7128 +1993-08-01,103.4214 +1993-09-01,108.6304 +1993-10-01,124.8315 +1993-11-01,124.8048 +1993-12-01,122.7720 +1994-01-01,105.3330 +1994-02-01,100.7475 +1994-03-01,98.4825 +1994-04-01,89.3258 +1994-05-01,87.0124 +1994-06-01,93.7943 +1994-07-01,96.4548 +1994-08-01,102.9823 +1994-09-01,109.7563 +1994-10-01,123.0683 +1994-11-01,123.1853 +1994-12-01,124.9834 +1995-01-01,107.7064 +1995-02-01,99.2227 +1995-03-01,96.1946 +1995-04-01,94.2656 +1995-05-01,93.5966 +1995-06-01,98.2886 +1995-07-01,98.2274 +1995-08-01,102.2729 +1995-09-01,107.2035 +1995-10-01,120.4112 +1995-11-01,123.8626 +1995-12-01,128.9061 +1996-01-01,104.1852 +1996-02-01,105.5477 +1996-03-01,102.9177 +1996-04-01,94.8080 +1996-05-01,97.0557 +1996-06-01,100.4093 +1996-07-01,98.2829 +1996-08-01,108.0978 +1996-09-01,114.7798 +1996-10-01,126.2366 +1996-11-01,133.8463 +1996-12-01,136.1510 +1997-01-01,110.6349 +1997-02-01,109.6545 +1997-03-01,106.5499 +1997-04-01,98.1605 +1997-05-01,97.3783 +1997-06-01,101.5750 +1997-07-01,98.3122 +1997-08-01,109.9673 +1997-09-01,115.0362 +1997-10-01,129.5071 +1997-11-01,135.1607 +1997-12-01,136.0268 +1998-01-01,119.7766 +1998-02-01,117.1886 +1998-03-01,110.8164 +1998-04-01,106.1647 +1998-05-01,107.1149 +1998-06-01,110.8432 +1998-07-01,109.0117 +1998-08-01,117.6771 +1998-09-01,120.9282 +1998-10-01,132.6661 +1998-11-01,136.9855 +1998-12-01,135.9605 +1999-01-01,117.3789 +1999-02-01,114.6903 +1999-03-01,107.1010 +1999-04-01,106.2725 +1999-05-01,107.8371 +1999-06-01,108.3356 +1999-07-01,107.8132 +1999-08-01,112.5035 +1999-09-01,116.6453 +1999-10-01,131.6485 +1999-11-01,131.7630 +1999-12-01,134.5654 +2000-01-01,123.1325 +2000-02-01,119.7423 +2000-03-01,113.9508 +2000-04-01,115.9481 +2000-05-01,108.7202 +2000-06-01,114.2071 +2000-07-01,111.8737 +2000-08-01,117.9027 +2000-09-01,125.6499 +2000-10-01,136.8146 +2000-11-01,135.6331 +2000-12-01,138.7040 +2001-01-01,122.5767 +2001-02-01,121.8879 +2001-03-01,118.5969 +2001-04-01,114.6967 +2001-05-01,112.9349 +2001-06-01,115.3333 +2001-07-01,113.4896 +2001-08-01,119.6772 +2001-09-01,123.5141 +2001-10-01,125.5298 +2001-11-01,128.7324 +2001-12-01,129.6597 +2002-01-01,117.5658 +2002-02-01,114.5385 +2002-03-01,110.3068 +2002-04-01,104.6927 +2002-05-01,101.8499 +2002-06-01,112.2162 +2002-07-01,110.4021 +2002-08-01,117.5309 +2002-09-01,119.4877 +2002-10-01,124.0385 +2002-11-01,128.5738 +2002-12-01,124.1789 +2003-01-01,113.0303 +2003-02-01,111.1786 +2003-03-01,111.2168 +2003-04-01,105.1536 +2003-05-01,107.6101 +2003-06-01,111.6628 +2003-07-01,102.8517 +2003-08-01,113.1477 +2003-09-01,116.8135 +2003-10-01,125.1860 +2003-11-01,132.7907 +2003-12-01,128.8211 +2004-01-01,116.1890 +2004-02-01,117.6700 +2004-03-01,103.9096 +2004-04-01,102.2036 +2004-05-01,109.8036 +2004-06-01,106.2960 +2004-07-01,106.1117 +2004-08-01,116.2320 +2004-09-01,120.7290 +2004-10-01,132.3043 +2004-11-01,133.1452 +2004-12-01,129.9987 +2005-01-01,124.5687 +2005-02-01,123.9260 +2005-03-01,107.2575 +2005-04-01,106.8015 +2005-05-01,111.4551 +2005-06-01,107.1940 +2005-07-01,110.2132 +2005-08-01,114.8196 +2005-09-01,119.7252 +2005-10-01,137.1695 +2005-11-01,136.3902 +2005-12-01,139.9153 +2006-01-01,118.2816 +2006-02-01,117.8165 +2006-03-01,108.4194 +2006-04-01,107.5783 +2006-05-01,101.9894 +2006-06-01,101.9425 +2006-07-01,101.7114 +2006-08-01,112.0216 
+2006-09-01,118.6654 +2006-10-01,129.6397 +2006-11-01,130.3710 +2006-12-01,132.0261 +2007-01-01,121.7363 +2007-02-01,116.4986 +2007-03-01,112.6224 +2007-04-01,99.4400 +2007-05-01,98.0703 +2007-06-01,94.9320 +2007-07-01,91.3872 +2007-08-01,100.7496 +2007-09-01,110.1524 +2007-10-01,115.9774 +2007-11-01,118.4564 +2007-12-01,120.7117 +2008-01-01,108.7465 +2008-02-01,101.7820 +2008-03-01,97.2060 +2008-04-01,91.8637 +2008-05-01,88.9254 +2008-06-01,89.0084 +2008-07-01,85.1186 +2008-08-01,88.5622 +2008-09-01,103.2736 +2008-10-01,114.0601 +2008-11-01,115.8743 +2008-12-01,101.7672 +2009-01-01,89.9004 +2009-02-01,88.9836 +2009-03-01,85.5603 +2009-04-01,79.7102 +2009-05-01,80.2515 +2009-06-01,79.5651 +2009-07-01,82.3126 +2009-08-01,89.0494 +2009-09-01,101.1519 +2009-10-01,123.6728 +2009-11-01,117.0719 +2009-12-01,116.5435 +2010-01-01,100.3797 +2010-02-01,99.0155 +2010-03-01,91.9654 +2010-04-01,89.4914 +2010-05-01,89.9713 +2010-06-01,89.5047 +2010-07-01,96.4638 +2010-08-01,106.7689 +2010-09-01,115.8542 +2010-10-01,126.2773 +2010-11-01,117.7195 +2010-12-01,118.7519 +2011-01-01,103.0635 +2011-02-01,102.5548 +2011-03-01,98.9834 +2011-04-01,97.5274 +2011-05-01,91.3629 +2011-06-01,89.6899 +2011-07-01,89.6268 +2011-08-01,91.8899 +2011-09-01,93.9062 +2011-10-01,116.7634 +2011-11-01,116.8258 +2011-12-01,114.9563 +2012-01-01,99.9662 +2012-02-01,99.0417 +2012-03-01,94.1484 +2012-04-01,87.6950 +2012-05-01,85.3510 +2012-06-01,86.5815 +2012-07-01,89.5217 +2012-08-01,98.2967 +2012-09-01,112.2694 +2012-10-01,114.9091 +2012-11-01,116.0791 +2012-12-01,116.1401 +2013-01-01,107.0733 +2013-02-01,102.0263 +2013-03-01,102.6319 +2013-04-01,95.3206 +2013-05-01,91.7584 +2013-06-01,91.8125 +2013-07-01,92.4299 +2013-08-01,100.3593 +2013-09-01,105.5167 +2013-10-01,117.3458 +2013-11-01,121.6179 +2013-12-01,123.2412 +2014-01-01,104.5665 +2014-02-01,103.9509 +2014-03-01,101.0708 +2014-04-01,93.0044 +2014-05-01,88.4073 +2014-06-01,89.3661 +2014-07-01,88.0949 +2014-08-01,98.0799 +2014-09-01,106.8675 +2014-10-01,119.7665 +2014-11-01,129.0619 +2014-12-01,128.5528 +2015-01-01,109.9525 +2015-02-01,108.9073 +2015-03-01,106.5261 +2015-04-01,101.0631 +2015-05-01,96.7802 +2015-06-01,100.8339 +2015-07-01,102.8290 +2015-08-01,115.9030 +2015-09-01,115.8964 +2015-10-01,126.7440 +2015-11-01,124.5176 +2015-12-01,120.2374 +2016-01-01,108.5041 +2016-02-01,108.1308 +2016-03-01,107.9417 +2016-04-01,103.6179 +2016-05-01,102.0816 +2016-06-01,102.4044 +2016-07-01,102.9512 +2016-08-01,104.6977 +2016-09-01,109.3191 +2016-10-01,119.0502 +2016-11-01,116.8431 +2016-12-01,116.4535 +2017-01-01,109.4666 +2017-02-01,113.4661 +2017-03-01,105.2245 +2017-04-01,107.4288 +2017-05-01,101.9209 +2017-06-01,104.2022 +2017-07-01,102.5861 +2017-08-01,114.0613 diff --git a/tests/data/df_test_naive.csv b/tests/data/df_test_naive.csv new file mode 100644 index 0000000..d023ed4 --- /dev/null +++ b/tests/data/df_test_naive.csv @@ -0,0 +1,79 @@ +date,y +2017-01-29,0.4349205203965459 +2017-02-05,3.0939244374554384 +2017-02-12,3.570576295782257 +2017-02-19,3.563342711260862 +2017-02-26,6.385531690970693 +2017-03-05,7.1190596102394474 +2017-03-12,6.7996367386975365 +2017-03-19,6.835399190691371 +2017-03-26,7.108505735705831 +2017-04-02,7.057090295549315 +2017-04-09,6.7499156125551645 +2017-04-16,6.162630205418948 +2017-04-23,6.198688820948918 +2017-04-30,7.046710921413035 +2017-05-07,6.502776071560957 +2017-05-14,6.801603931195019 +2017-05-21,6.704787146284872 +2017-05-28,6.664921078362667 +2017-06-04,6.21757486453535 +2017-06-11,6.847429648511562 +2017-06-18,6.669050538105244 
+2017-06-25,6.684271203224789 +2017-07-02,6.893301693784536 +2017-07-09,6.735451167048266 +2017-07-16,6.859307417620351 +2017-07-23,6.834707822624351 +2017-07-30,6.793872366111498 +2017-08-06,6.822959736610834 +2017-08-13,6.74675938333047 +2017-08-20,6.517230286438933 +2017-08-27,6.812962761898247 +2017-09-03,6.557346777379345 +2017-09-10,7.301625554518906 +2017-09-17,7.071100098663302 +2017-09-24,6.8941645008092305 +2017-10-01,7.170365759931321 +2017-10-08,6.842752321704603 +2017-10-15,7.190685255413404 +2017-10-22,7.1968926085980005 +2017-10-29,6.832378325837326 +2017-11-05,7.186430516910802 +2017-11-12,7.189336007245643 +2017-11-19,7.274336435808509 +2017-11-26,7.395746721528261 +2017-12-03,7.392821166940019 +2017-12-10,7.573502955309098 +2017-12-17,7.563499486808907 +2017-12-24,7.15602503364969 +2017-12-31,4.950378683070151 +2018-01-07,6.4613873176891286 +2018-01-14,7.23745630660972 +2018-01-21,7.413771393172441 +2018-01-28,7.290036969811424 +2018-02-04,7.330240616312069 +2018-02-11,7.257741037653408 +2018-02-18,6.943758719510568 +2018-02-25,7.155971064420781 +2018-03-04,8.466230408426785 +2018-03-11,7.452759660447607 +2018-03-18,7.435772673953147 +2018-03-25,7.390361816944939 +2018-04-01,6.86589826083204 +2018-04-08,6.662158701811143 +2018-04-15,7.165209780232458 +2018-04-22,7.120109845677651 +2018-04-29,7.402460513091829 +2018-05-06,7.381854799086771 +2018-05-13,6.774638820831432 +2018-05-20,7.074164419607107 +2018-05-27,7.170134647803297 +2018-06-03,6.536887800141038 +2018-06-10,7.2318692224535 +2018-06-17,7.202142719034337 +2018-06-24,6.926316852627774 +2018-07-01,7.072639779280713 +2018-07-08,7.097870985414842 +2018-07-15,6.9114185617021375 +2018-07-22,6.951945268558375 diff --git a/tests/data/df_test_naive2.csv b/tests/data/df_test_naive2.csv new file mode 100644 index 0000000..f5769bd --- /dev/null +++ b/tests/data/df_test_naive2.csv @@ -0,0 +1,58 @@ +date,source,x,y +2017-06-25,P-03-In,0,0.009199999894927336 +2017-07-02,P-03-In,1,0.009199999986959702 +2017-07-09,P-03-In,2,0.009200000000498108 +2017-07-16,P-03-In,3,0.009199999888299563 +2017-07-23,P-03-In,4,0.009200000010290943 +2017-07-30,P-03-In,5,0.009200000022657678 +2017-08-06,P-03-In,6,0.009199999984631519 +2017-08-13,P-03-In,7,0.00919999996280566 +2017-08-20,P-03-In,8,0.009199999994843337 +2017-08-27,P-03-In,9,0.009199999971220093 +2017-09-03,P-03-In,10,0.009200000030683091 +2017-09-10,P-03-In,11,0.009200000002552809 +2017-09-17,P-03-In,12,0.00919999995405611 +2017-09-24,P-03-In,13,0.009199999976381365 +2017-10-01,P-03-In,14,0.0092000000324407 +2017-10-08,P-03-In,15,0.009199999945804973 +2017-10-15,P-03-In,16,0.00919999997597581 +2017-10-22,P-03-In,17,0.009199999979020355 +2017-10-29,P-03-In,18,0.0092000000185485 +2017-11-05,P-03-In,19,0.009200000007130841 +2017-11-12,P-03-In,20,0.009199999989690455 +2017-11-19,P-03-In,21,0.009200000020362446 +2017-11-26,P-03-In,22,0.009200000028438795 +2017-12-03,P-03-In,23,0.009200000038151055 +2017-12-10,P-03-In,24,0.009199999937747427 +2017-12-17,P-03-In,25,0.00919999998062174 +2017-12-24,P-03-In,26,0.009200000003846808 +2017-12-31,P-03-In,27,0.009200000053336922 +2018-01-07,P-03-In,28,0.00919999997026424 +2018-01-14,P-03-In,29,0.009199999966684136 +2018-01-21,P-03-In,30,0.009200000040870715 +2018-01-28,P-03-In,31,0.00920000000311432 +2018-02-04,P-03-In,32,0.009199999981294036 +2018-02-11,P-03-In,33,0.009200000000277988 +2018-02-18,P-03-In,34,0.009199999959599527 +2018-02-25,P-03-In,35,0.009199999980314294 +2018-03-04,P-03-In,36,0.009199999973300394 
+2018-03-11,P-03-In,37,0.009200000037236106 +2018-03-18,P-03-In,38,0.00920000001901414 +2018-03-25,P-03-In,39,0.00920000005450935 +2018-04-01,P-03-In,40,0.009200000019079686 +2018-04-08,P-03-In,41,0.009199999980682175 +2018-04-15,P-03-In,42,0.00919999991890617 +2018-04-22,P-03-In,43,0.009199999955175744 +2018-04-29,P-03-In,44,0.009199999971244061 +2018-05-06,P-03-In,45,0.05582210075845062 +2018-05-13,P-03-In,46,0.06648076233741707 +2018-05-20,P-03-In,47,0.06868591669154471 +2018-05-27,P-03-In,48,0.06702831154113441 +2018-06-03,P-03-In,49,0.06796620179936401 +2018-06-10,P-03-In,50,0.06790846507858844 +2018-06-17,P-03-In,51,0.06751835805547822 +2018-06-24,P-03-In,52,0.06994368168797657 +2018-07-01,P-03-In,53,0.06844987136927888 +2018-07-08,P-03-In,54,0.07056084702443666 +2018-07-15,P-03-In,55,0.06925717368373535 +2018-07-22,P-03-In,56,0.07054481867982176 diff --git a/tests/data/test_normalize.csv b/tests/data/test_normalize.csv new file mode 100644 index 0000000..fdeee3d --- /dev/null +++ b/tests/data/test_normalize.csv @@ -0,0 +1,45 @@ +date,y +2015-01-01,0 +2015-02-01,1 +2015-03-01,2 +2015-04-01,3 +2015-05-01,4 +2015-06-01,5 +2015-07-01,6 +2015-08-01,7 +2015-09-01,8 +2015-10-01,9 +2015-11-01,10 +2015-12-01,11 +2016-01-01,12 +2016-02-01,13 +2016-03-01,14 +2016-04-01,15 +2016-05-01,16 +2016-06-01,17 +2016-07-01,18 +2016-08-01,19 +2016-09-01,20 +2016-10-01,21 +2016-11-01,22 +2016-12-01,23 +2017-01-01,24 +2017-02-01,25 +2017-03-01,26 +2017-04-01,27 +2017-05-01,28 +2017-06-01,29 +2017-07-01,30 +2017-08-01,31 +2017-09-01,32 +2017-10-01,33 +2017-11-01,34 +2017-12-01,35 +2018-01-01,36 +2018-02-01,37 +2018-03-01,38 +2018-04-01,39 +2018-05-01,40 +2018-06-01,41 +2018-07-01,42 +2018-08-01,43 diff --git a/tests/test_forecast.py b/tests/test_forecast.py new file mode 100644 index 0000000..53333a7 --- /dev/null +++ b/tests/test_forecast.py @@ -0,0 +1,2513 @@ +""" + +Author: Pedro Capelastegui +Created on 04/12/2015 + +""" + +import platform +import os +import logging +import unittest +import pandas as pd, numpy as np + +from anticipy.model_utils import interpolate_df +from anticipy.utils_test import PandasTest +from anticipy.forecast import * + +# Dask dependencies - not currently used +# from dask import delayed +# from dask import compute +# from dask.distributed import Client +# from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler +# from dask.diagnostics import visualize + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def logger_info(msg, data): + logger.info(msg + '\n%s\n', data) + + +base_folder = os.path.join(os.path.dirname(__file__), 'data') + +pd.set_option('display.max_columns', 40) +pd.set_option('display.max_rows', 200) +pd.set_option('display.width', 1000) + + +def list_to_str(l): + if isinstance(l, list): + return str([str(i) for i in l]) + else: + return str(l) + +def array_ones_in_indices(n, l_indices): + return np.isin(np.arange(0, n), l_indices).astype(float) + +def array_zeros_in_indices(n, l_indices): + return (~np.isin(np.arange(0, n), l_indices)).astype(float) + +def print_forecast_driver_output(fcast_driver_output, log_first_line=None): + if fcast_driver_output.empty: + logger.info('Error: empty output') + else: + if log_first_line is not None: + log_first_line = '\r\n' + log_first_line + else: + log_first_line = '' + logger.info(log_first_line + '\r\nAIC_C:' + str(fcast_driver_output.dict_aic_c)) + # logger_info('AIC_C:',fcast_driver_output[0]) + +# usage: +# compute_prof(l_dict_result2_d, scheduler = 'processes', 
num_workers=4, title='Test figure') +def compute_prof(*args, **kwargs ): + with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof: + out = compute(*args, **kwargs) + visualize([prof, rprof,# cprof + ], show=True) + return out + + +class TestForecast(PandasTest): + def setUp(self): + pass + + def test_normalize_df(self): + + def run_test(df, df_expected, **kwargs): + df_out = normalize_df(df, **kwargs) + logger_info('df_out:', df_out.tail(10)) + self.assert_frame_equal(df_out, df_expected) + + a_y = np.full(10, 0.0) + a_x = np.arange(0, 10).astype(np.int64) + a_x2 = np.tile(np.arange(0, 5), 2).astype(np.int64) + a_x2_out = np.repeat(np.arange(0, 5), 2).astype(np.int64) + a_source = ['s1'] * 5 + ['s2'] * 5 + a_weight = np.full(10, 1.0) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + a_date2 = np.tile(pd.date_range('2014-01-01', periods=5, freq='D'), 2) + a_date2_out = np.repeat(pd.date_range('2014-01-01', periods=5, freq='D'), 2) + + logger_info('DEBUG: ', a_date2) + + # Test 0: Empty input + + self.assertIsNone(normalize_df(pd.DataFrame)) + + # Test 1: Output with x,y columns + df_expected = pd.DataFrame({'y': a_y, 'x': a_x, })[['x', 'y']] + + l_input = [ + [pd.DataFrame({'y': a_y}), {}], + [pd.DataFrame({'y': a_y, 'x': a_x}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x}), {'col_name_y': 'y_test', 'col_name_x': 'x_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 2: Output with x,y,weight columns + df_expected = pd.DataFrame({'y': a_y, 'x': a_x, 'weight': a_weight})[['x', 'y', 'weight']] + + l_input = [ + [pd.DataFrame({'y': a_y, 'weight': a_weight}), {}], + [pd.DataFrame({'y': a_y, 'x': a_x, 'weight': a_weight}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x, 'weight_test': a_weight}), + {'col_name_y': 'y_test', 'col_name_x': 'x_test', 'col_name_weight': 'weight_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 3: Output with x,y,weight,date columns + df_expected = pd.DataFrame({'y': a_y, 'x': a_x, 'weight': a_weight, 'date': a_date})[ + ['date', 'x', 'y', 'weight']] + + l_input = [ + [pd.DataFrame({'y': a_y, 'weight': a_weight, 'date': a_date}), {}], + [pd.DataFrame({'y': a_y, 'weight': a_weight}, index=a_date), {}], + [pd.DataFrame({'y': a_y, 'x': a_x, 'weight': a_weight, 'date': a_date}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x, 'weight_test': a_weight, 'date_test': a_date}), + {'col_name_y': 'y_test', 'col_name_x': 'x_test', 'col_name_weight': 'weight_test', + 'col_name_date': 'date_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 4: Input series + df_expected = pd.DataFrame({'y': a_y, 'x': a_x, })[['x', 'y']] + + l_input = [ + [pd.Series(a_y, name='y'), {}], + [pd.Series(a_y, name='y', index=a_x), {}], + [pd.Series(a_y, name='y_test'), {'col_name_y': 'y_test'}], + # [pd.DataFrame({'y_test': a_y, 'x_test': a_x}), {'col_name_y':'y_test','col_name_x':'x_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 5: Input series with datetimeindex + df_expected = pd.DataFrame({'y': a_y, 'x': a_x, 'date': a_date})[['date', 'x', 'y']] + + l_input = [ + [pd.Series(a_y, name='y', index=a_date), {}], + [pd.Series(a_y, name='y_test', index=a_date), {'col_name_y': 'y_test'}], + # [pd.DataFrame({'y_test': a_y, 'x_test': a_x}), {'col_name_y':'y_test','col_name_x':'x_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 6: Input df, output with x, y, weight, date, source 
columns + df_expected = ( + pd.DataFrame({'y': a_y, 'x':a_x2, 'source': a_source, 'weight':a_weight,'date':a_date2}) + [['date','source','x','y','weight']] + ) + + l_input = [ + [pd.DataFrame({'y': a_y, 'weight':a_weight, 'date':a_date2, 'source': a_source}),{}], + # Datetime index not supported with source - could be added back with multindex + #[pd.DataFrame({'y': a_y, 'weight': a_weight},index = a_date), {}], + [pd.DataFrame({'y': a_y, 'x': a_x2, 'weight': a_weight,'source':a_source,'date':a_date2}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x2, 'weight_test':a_weight, 'date_test':a_date2, + 'source_test':a_source}), + {'col_name_y':'y_test','col_name_x':'x_test','col_name_weight':'weight_test', 'col_name_date':'date_test', + 'col_name_source':'source_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 7: Input df has multiple values per date per source + df_expected = ( + pd.DataFrame({'y': a_y, 'x':a_x2_out, 'weight':a_weight,'date':a_date2_out}) + [['date','x','y','weight']] + ) + + l_input = [ + [pd.DataFrame({'y': a_y, 'weight': a_weight, 'date': a_date2}), {}], + # Datetime index not supported with source - could be added back with multindex + [pd.DataFrame({'y': a_y, 'x': a_x2, 'weight': a_weight, 'date': a_date2}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x2, 'weight_test': a_weight, 'date_test': a_date2, + }), + {'col_name_y': 'y_test', 'col_name_x': 'x_test', 'col_name_weight': 'weight_test', + 'col_name_date': 'date_test', + }] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 8: input df has date column in string form + a_date_str = a_date2.astype(str) + df_expected = ( + pd.DataFrame({'y': a_y, 'x':a_x2, 'source': a_source, 'weight':a_weight,'date':a_date2}) + [['date','source','x','y','weight']] + ) + + l_input = [ + [pd.DataFrame({'y': a_y, 'weight': a_weight, 'date': a_date_str, 'source': a_source}), {}], + [pd.DataFrame({'y': a_y, 'x': a_x2, 'weight': a_weight, 'source': a_source, 'date': a_date_str}), {}], + [pd.DataFrame({'y_test': a_y, 'x_test': a_x2, 'weight_test': a_weight, 'date_test': a_date_str, + 'source_test': a_source}), + {'col_name_y': 'y_test', 'col_name_x': 'x_test', 'col_name_weight': 'weight_test', + 'col_name_date': 'date_test', + 'col_name_source': 'source_test'}] + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 9: unordered input df + + df_expected = pd.DataFrame({'y': a_y, 'x':a_x,})[['x','y']] + + l_input = [ + [pd.DataFrame({'y': a_y[::-1]}),{}], + [pd.DataFrame({'y': a_y[::-1], 'x': a_x[::-1]}), {}], + ] + for df, kwargs in l_input: + run_test(df, df_expected, **kwargs) + + # Test 10: candy production dataset + path_candy = os.path.join(base_folder, 'candy_production.csv') + df_candy_raw = pd.read_csv(path_candy) + df_candy = df_candy_raw.pipe(normalize_df, + col_name_y='IPG3113N', col_name_date='observation_date') + logger_info('df_candy:', df_candy.tail()) + + # Test 11: test_normalize.csv + + path_file = os.path.join(base_folder, 'test_normalize.csv') + df_test_raw = pd.read_csv(path_file) + df_test = df_test_raw.pipe(normalize_df,) + logger_info('df_test:', df_test.x.diff().loc[df_test.x.diff()>1.0]) + self.assertFalse((df_test.x.diff()>1.0).any()) + + # Test 11b: test_normalize.csv, with gaps + + path_file = os.path.join(base_folder, 'test_normalize.csv') + df_test_raw = pd.read_csv(path_file) + df_test_raw = pd.concat([df_test_raw.head(10), df_test_raw.tail(10)]) + df_test = df_test_raw.pipe(normalize_df,) + 
logger_info('df_test:',df_test) + logger_info('df_test:', df_test.x.diff().loc[df_test.x.diff()>1.0]) + self.assertTrue((df_test.x.max()==43)) + + def test_interpolate_df(self): + + # # Test 1: DF with date column, gap + # a_y = np.arange(0,10.) + # a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + # df_expected = pd.DataFrame({'y': a_y, 'date': a_date}).pipe(normalize_df) + # df = pd.concat([df_expected.head(5), df_expected.tail(-6)]).pipe(normalize_df) + # + # df_result = df.pipe(interpolate_df) + # logger_info('df_result:', df_result) + # self.assert_frame_equal(df_result, df_expected) + # + # df_result = df.pipe(interpolate_df, include_mask=True) + # + # # Test 1: DF with no date column, gap + # a_y = np.arange(0,10.) + # a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + # df_expected = pd.DataFrame({'y': a_y}).pipe(normalize_df) + # df = pd.concat([df_expected.head(5), df_expected.tail(-6)]).pipe(normalize_df) + # + # df_result = df.pipe(interpolate_df) + # logger_info('df_result:', df_result) + # self.assert_frame_equal(df_result, df_expected) + # + # df_result = df.pipe(interpolate_df, include_mask=True) + # logger_info('df_result:', df_result) + + + # Test 2: Sparse series with date gaps + df_test = pd.DataFrame({'date': pd.to_datetime(['2018-08-01', '2018-08-09']), 'y': [1., 2.]}) + df_result = df_test.pipe(interpolate_df, include_mask=True) + logger_info('df_result:', df_result) + self.assertEqual(df_result.index.size,9) + + + + + def test_forecast_input(self): + y_values1 = pd.DataFrame({'a': np.full(100, 0.0), + 'b': np.round(np.arange(-0.5, 0.5, 0.01), 2), }, + index=pd.date_range('2014-01-01', periods=100, freq='D')) + # Too few samples + n = 4 + y_values1b = pd.DataFrame({'a': np.full(n, 0.0)}, + index=pd.date_range('2014-01-01', periods=n, freq='D')) + + y_values2 = pd.DataFrame({'a': np.full(100, 0.0)}, + index=pd.date_range('2014-01-01', periods=100, freq='D')) + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_constant, forecast_models.model_linear], + l_model_season=None, df_y=y_values1, + weights_y_values=1.0, date_start_actuals=None + ) + logger_info('Solver config:', conf1) + + def test_get_residuals(self): + # Linear model + model = forecast_models.model_linear + a_y = np.arange(10.0) + a_x = np.arange(10.0) + a_date = None + # Using parameter(0,0) + residuals = get_residuals([0, 0], model, a_x, a_y, a_date) + l_expected1 = np.arange(10.0) + logger_info('residuals:', residuals) + self.assert_array_equal(residuals, l_expected1) + + # Test - If input array is not 1-dimensional, throw Exception + model = forecast_models.model_linear + a_y = pd.DataFrame({'a': np.arange(10.0), 'b': -np.arange(10.0)}).values + a_x = np.arange(10.0) + with self.assertRaises(AssertionError): + residuals = get_residuals([0, 0], model, a_x, a_y, a_date) + + # Test - multiple values per sample + a_y = np.concatenate([np.arange(10.0), -np.arange(10.0)]) + a_x = np.tile(np.arange(10.0), 2) + + residuals = get_residuals([0, 0], model, a_x, a_y, a_date) + logger_info('residuals:', residuals) + l_expected2 = np.concatenate([np.arange(10.0), np.arange(10.0)]) + self.assert_array_equal(residuals, l_expected2) + + # As above, but applying weights to input time series [1.0, 0] + residuals = get_residuals([0, 0], model, a_x, a_y, a_date, + a_weights=np.repeat([1.0, 0], 10)) + l_expected2b = np.concatenate([np.arange(10.0), np.full(10, 0)]) + logger_info('residuals:', residuals) + 
self.assert_array_equal(residuals, l_expected2b) + + # TODO: MORE TESTS WITH WEIGHTS_Y_VALUES + + # New test, different parameters + residuals = get_residuals([0, 5], model, a_x, a_y, a_date) + logger_info('residuals:', residuals) + self.assert_array_equal(residuals, + [5., 4., 3., 2., 1., 0., 1., 2., 3., 4., 5., 6., 7., + 8., 9., 10., 11., 12., 13., 14.]) + + # Test - Use a_weights to weight residuals based on time + # Using parameter(0,0) + a_y = np.arange(10.0) + a_x = np.arange(10.0) + a_weights = np.linspace(1., 2., 10) + logger_info('a_y: ', a_y) + logger_info('a_weights: ', a_weights) + residuals = get_residuals([0, 0], model, a_x, a_y, a_date, a_weights=a_weights) + self.assert_array_equal(residuals, np.arange(10.0) * a_weights) + logger_info('residuals:', residuals) + + def test_optimize_least_squares(self): + # Setup + a_x = pd.np.arange(100.0) + a_y = np.arange(100.0) + + a_x_long = np.tile(a_x, 2) + a_y_long = np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]) + a_date = None + + l_model = [ + forecast_models.model_linear, + forecast_models.model_constant + ] + + def print_result(result): + logger.info('result cost: %s, shape: %s, x: %s, message: %s', + result.cost, result.fun.shape, result.x, result.message) + + for model in l_model: + logger.info('#### Model function: %s', model.name) + + df_result = optimize_least_squares(model, a_x, a_y, a_date) + logger_info('result:', df_result) + self.assertTrue(df_result.success.any()) + # logger_info('result.x:',res_trend.x) + + df_result = optimize_least_squares(model, a_x_long, a_y_long, a_date) + logger_info('result:', df_result) + self.assertTrue(df_result.success.any()) + + def test_fit_model(self): + # Input dataframes must have an y column, and may have columns x,date, weight + + # Setup + # TODO: Use pre-normalized input dfs, rather than callling normalize_df() + dict_df_y = { + # Single ts + 'df_1ts_nodate': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}), + # 2 ts + 'df_2ts_nodate': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'x': np.tile(np.arange(0, 100), 2), + }), + # 1 ts with datetime index + 'df_1ts_w': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W')), + # 2 ts with datetime index + 'df_2ts_w': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='W'), 2)), + # Single ts, freq=D + 'df_1ts_d': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}, + index=pd.date_range('2014-01-01', periods=100, freq='D')), + # 2 ts with datetime index, freq=D + 'df_2ts_d': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)])}, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='D'), 2)) + } + l_source1 = ['df_1ts_nodate', 'df_2ts_nodate', 'df_1ts_w', 'df_1ts_w', 'df_2ts_d', 'df_2ts_d'] + l_source2 = ['df_1ts_d', 'df_2ts_d'] + + # Naive trend models - cannot add seasonality + l_model1a = [ + forecast_models.model_naive, # model_naive never actually goes to fit_model + # TODO: add assert check on fit model re: validity of input model + + ] + + 
l_model1b = [ + forecast_models.model_snaive_wday + # TODO: add assert check on fit model re: validity of input model + + ] + + l_model1c = [ + forecast_models.model_linear, + forecast_models.model_constant + ] + # All trend models + l_model1 = l_model1a+l_model1b+l_model1c + + l_model2 = [ + forecast_models.model_season_wday, + forecast_models.model_season_wday_2, + forecast_models.model_season_month + ] + l_model3 = get_list_model(l_model1c, l_model2) + + l_results = [] + l_optimize_info = [] + + l_add_weight = [False, True] + + def run_test_logic(source, model, add_weight): + df_y = dict_df_y[source].copy() + if add_weight: # Enable weight column + df_y['weight'] = df_y['weight_test'] + df_y = df_y.pipe(normalize_df) + logger.info('Fitting src: %s , mod: %s, add_weight: %s', source, model, add_weight) + dict_fit_model = fit_model(model, df_y, source=source, df_actuals = df_y) + return dict_fit_model + # logger_info('Result: ',result) + + # Test - single solver type, return best fit + for (source, model, add_weight) in itertools.product( + l_source1, l_model1a+l_model1c, l_add_weight): + dict_fit_model = run_test_logic(source, model, add_weight) + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + l_results += [result_tmp] + l_optimize_info += [info_tmp] + + # Now for models that require datetimeindex + for (source, model, add_weight) in itertools.product( + l_source2, l_model1b+l_model2, l_add_weight): + dict_fit_model = run_test_logic(source, model, add_weight) + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + l_results += [result_tmp] + l_optimize_info += [info_tmp] + + # Finally, we use trend+seasonality with all models + for (source, model, add_weight) in itertools.product( + l_source2, l_model3, l_add_weight): + dict_fit_model = run_test_logic(source, model, add_weight) + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + l_results += [result_tmp] + l_optimize_info += [info_tmp] + + df_result = pd.concat(l_results, sort=False, ignore_index=True) + df_optimize_info = pd.concat(l_optimize_info, sort=False, ignore_index=True) + + self.assertFalse(df_result.cost.pipe(pd.isnull).any()) + + logger_info('Result summary:', df_result) + logger_info('Optimize info summary:', df_optimize_info) + + @unittest.skip('Dask not supported yet') + def test_fit_model_dask(self): + # Input dataframes must have an y column, and may have columns x,date, weight + + # Setup + # TODO: Use pre-normalized input dfs, rather than callling normalize_df() + dict_df_y = { + # Single ts + 'df_1ts_nodate': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}), + # 2 ts + 'df_2ts_nodate': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'x': np.tile(np.arange(0, 100), 2), + }), + # 1 ts with datetime index + 'df_1ts_w': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W')), + # 2 ts with datetime index + 'df_2ts_w': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='W'), 2)), + # Single ts, freq=D + 'df_1ts_d': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}, + 
index=pd.date_range('2014-01-01', periods=100, freq='D')), + # 2 ts with datetime index, freq=D + 'df_2ts_d': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)])}, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='D'), 2)) + } + l_source1 = ['df_1ts_nodate', 'df_2ts_nodate', 'df_1ts_w', 'df_1ts_w', 'df_2ts_d', 'df_2ts_d'] + l_source2 = ['df_1ts_d', 'df_2ts_d'] + + l_model1 = [ + forecast_models.model_naive, # model_naive never actually goes to fit_model + # TODO: add assert check on fit model re: validity of input model + forecast_models.model_linear, + forecast_models.model_constant + ] + l_model2 = [ + forecast_models.model_season_wday, + forecast_models.model_season_wday_2, + forecast_models.model_season_month + ] + l_model3 = get_list_model(l_model1, l_model2) + + l_add_weight = [False, True] + + def run_test_logic(df_y, source, model, add_weight): + #df_y = dict_df_y[source].copy() + #if add_weight: # Enable weight column + # df_y['weight']=df_y['weight_test'] + col_name_weight = 'weight' if add_weight==True else 'no-weight' + df_y = df_y.pipe(normalize_df, col_name_weight=col_name_weight) + #logger.info('Fitting src: %s , mod: %s, add_weight: %s', source, model, add_weight) + #dict_fit_model = delayed(fit_model)(model, df_y, source=source, df_actuals = df_y) + dict_fit_model = fit_model(model, df_y, source=source, df_actuals=df_y) + return dict_fit_model + # logger_info('Result: ',result) + + def aggregate_dict_fit_model(l_dict_fit_model): + l_results = [] + l_optimize_info = [] + for dict_fit_model in l_dict_fit_model: + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + l_results += [result_tmp] + l_optimize_info += [info_tmp] + df_metadata = pd.concat(l_results, sort=False, ignore_index=True) + df_optimize_info = pd.concat(l_optimize_info, sort=False, ignore_index=True) + return df_metadata, df_optimize_info + + l_dict_fit_model_d = [] + + # Test - single solver type, return best fit + for (source, model, add_weight) in itertools.product( + l_source1, l_model1, l_add_weight): + l_dict_fit_model_d += [delayed(run_test_logic)(dict_df_y[source].copy(), source, model, add_weight)] + + + # Now for models that require datetimeindex + for (source, model, add_weight) in itertools.product( + l_source2, l_model2, l_add_weight): + l_dict_fit_model_d += [delayed(run_test_logic)(dict_df_y[source].copy(), source, model, add_weight)] + + # Finally, we use trend+seasonality with all models + for (source, model, add_weight) in itertools.product( + l_source2, l_model3, l_add_weight): + l_dict_fit_model_d += [delayed(run_test_logic)(dict_df_y[source].copy(), source, model, add_weight)] + + logger.info('generated delayed') + + #client = Client() + #logger_info('client:',client) + #l_dict_fit_model, = compute(l_dict_fit_model_d) + l_dict_fit_model, = compute_prof(l_dict_fit_model_d, scheduler='processes', num_workers=4) + #l_dict_fit_model, = compute(l_dict_fit_model_d, scheduler='processes', num_workers=4) + #l_dict_fit_model, = compute(l_dict_fit_model_d, scheduler='distributed', num_workers=4) + #l_dict_fit_model, = compute(l_dict_fit_model_d, scheduler='threads', num_workers=4) + #l_dict_fit_model = l_dict_fit_model_d + + df_metadata, df_optimize_info = aggregate_dict_fit_model(l_dict_fit_model) + #result_d = delayed(aggregate_dict_fit_model)(l_dict_fit_model_d) + #result_d = delayed(aggregate_dict_fit_model)(l_dict_fit_model_d) + 
#(df_metadata, df_optimize_info), = compute(result_d) + # result, = compute(result_d) + logger_info('Result summary:', df_metadata) + logger_info('Optimize info summary:', df_optimize_info) + #client.close() + + @unittest.skip('Dask not supported yet') + def test_fit_model_dask2(self): + # Input dataframes must have an y column, and may have columns x,date, weight + + # Setup + # TODO: Use pre-normalized input dfs, rather than callling normalize_df() + dict_df_y = { + # Single ts + 'df_1ts_nodate': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}), + # 2 ts + 'df_2ts_nodate': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'x': np.tile(np.arange(0, 100), 2), + }), + # 1 ts with datetime index + 'df_1ts_w': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W')), + # 2 ts with datetime index + 'df_2ts_w': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='W'), 2)), + # Single ts, freq=D + 'df_1ts_d': pd.DataFrame({'y': np.full(100, 0.0), + 'weight_test': np.full(100, 1.0)}, + index=pd.date_range('2014-01-01', periods=100, freq='D')), + # 2 ts with datetime index, freq=D + 'df_2ts_d': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)])}, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='D'), 2)) + } + l_source1 = ['df_1ts_nodate', 'df_2ts_nodate', 'df_1ts_w', 'df_1ts_w', 'df_2ts_d', 'df_2ts_d'] + l_source2 = ['df_1ts_d', 'df_2ts_d'] + + l_model1 = [ + forecast_models.model_naive, # model_naive never actually goes to fit_model + # TODO: add assert check on fit model re: validity of input model + forecast_models.model_linear, + forecast_models.model_constant + ] + l_model2 = [ + forecast_models.model_season_wday, + forecast_models.model_season_wday_2, + forecast_models.model_season_month + ] + l_model3 = get_list_model(l_model1, l_model2) + + l_weight = ['no-weight', 'weight_test'] + + def run_test_logic(df_y, source, model, add_weight): + #df_y = dict_df_y[source].copy() + if add_weight: # Enable weight column + df_y['weight']=df_y['weight_test'] + df_y = df_y.pipe(normalize_df) + #logger.info('Fitting src: %s , mod: %s, add_weight: %s', source, model, add_weight) + #dict_fit_model = delayed(fit_model)(model, df_y, source=source, df_actuals = df_y) + dict_fit_model = fit_model(model, df_y, source=source, df_actuals=df_y) + return dict_fit_model + # logger_info('Result: ',result) + + def aggregate_dict_fit_model(l_dict_fit_model): + l_results = [] + l_optimize_info = [] + for dict_fit_model in l_dict_fit_model: + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + l_results += [result_tmp] + l_optimize_info += [info_tmp] + df_metadata = delayed(pd.concat)(l_results, sort=False, ignore_index=False) + df_optimize_info = delayed(pd.concat)(l_optimize_info, sort=False, ignore_index=False) + return df_metadata, df_optimize_info + + l_dict_fit_model_d = [] + + # Test - single solver type, return best fit + + + l_dict_fit_model_d += [ + delayed(fit_model)(model, + dict_df_y[source].pipe(delayed(normalize_df), 
col_name_weight=weight), + source=source, df_actuals=dict_df_y[source].pipe(delayed(normalize_df), col_name_weight=weight)) + for (source, model, weight) in itertools.product(l_source1, l_model1, l_weight)] + + # Now for models that require datetimeindex + l_dict_fit_model_d += [ + delayed(fit_model)(model, + dict_df_y[source].pipe(delayed(normalize_df), col_name_weight=weight), + source=source, df_actuals=dict_df_y[source].pipe(delayed(normalize_df), col_name_weight=weight)) + for (source, model, weight) in itertools.product(l_source2, l_model2, l_weight)] + + # Finally, we use trend+seasonality with all models + l_dict_fit_model_d += [ + delayed(fit_model)(model, + dict_df_y[source].pipe(delayed(normalize_df), col_name_weight=weight), + source=source, df_actuals=dict_df_y[source].pipe(delayed(normalize_df), col_name_weight=weight)) + for (source, model, weight) in itertools.product( l_source2, l_model3, l_weight)] + + # # Finally, we use trend+seasonality with all models + # for (source, model, weight) in itertools.product( + # l_source2, l_model3, l_weight): + # df_y = dict_df_y[source].pipe(normalize_df, col_name_weight=weight) + # l_dict_fit_model_d += [delayed(fit_model)(model, df_y, source=source, df_actuals=df_y)] + + logger.info('generated delayed') + + #l_dict_fit_model, = compute(l_dict_fit_model_d) + l_dict_fit_model = l_dict_fit_model_d + + #df_metadata, df_optimize_info = aggregate_dict_fit_model(l_dict_fit_model) + result_d = delayed(aggregate_dict_fit_model)(l_dict_fit_model_d) + (df_metadata, df_optimize_info), = compute(result_d) + # result, = compute(result_d) + logger_info('Result summary:', df_metadata) + logger_info('Optimize info summary:', df_optimize_info) + + @unittest.skip('Dask not supported yet') + def test_dask(self): + def aggregate_result(l_dict_result): + l_metadata = [] + l_opt = [] + for dict_result in l_dict_result: + l_metadata += [dict_result['metadata']] + l_opt += [dict_result['optimize_info']] + return pd.concat(l_metadata, sort=False, ignore_index=False), pd.concat(l_opt, sort=False, + ignore_index=False) + + model = forecast_models.model_linear + df_y = pd.DataFrame({'y': np.full(100, 0.0), 'weight_test': np.full(100, 1.0)}).pipe(normalize_df) + + l_dict_result2_d = [delayed(fit_model)(model, df_y, source=i, df_actuals=df_y) for i in np.arange(0, 20)] + result_d = delayed(aggregate_result)(l_dict_result2_d) + #result = compute(result_d,scheduler='processes',num_workers=2) + result = compute(result_d) + logger_info('result',result) + + + def test_fit_model_date_gaps(self): + # Setup + # 2 ts with datetime index, freq=D + df_2ts_d = pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)])}, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='D'), 2)) + + df_y = pd.concat([df_2ts_d.head(), df_2ts_d.tail()]) + + model = forecast_models.model_linear + + l_col_name_weight = [None, 'weight'] + + l_results = [] + l_optimize_info = [] + + def run_test_logic(col_name_weight): + logger.info('Fitting col_w: %s', col_name_weight) + df_y_tmp = df_y.pipe(normalize_df,col_name_weight=col_name_weight) + dict_fit_model = fit_model(model, df_y_tmp, source='test') + return dict_fit_model + # logger_info('Result: ',result) + + # Test - single solver type, return best fit + for col_name_weight in l_col_name_weight: + dict_fit_model = run_test_logic(col_name_weight) + result_tmp = dict_fit_model['metadata'] + info_tmp = dict_fit_model['optimize_info'] + 
l_results += [result_tmp] + l_optimize_info += [info_tmp] + + df_result = pd.concat(l_results) + df_optimize_info = pd.concat(l_optimize_info) + logger_info('Result summary:', df_result) + logger_info('Optimize info summary:', df_optimize_info) + + def test_get_list_model(self): + l1 = [ + forecast_models.model_linear, + forecast_models.model_constant + ] + l2 = [ + forecast_models.model_season_wday_2, + forecast_models.model_null + ] + l_result_add = get_list_model(l1, l2, 'add') + l_result_mult = get_list_model(l1, l2, 'mult') + l_result_both = get_list_model(l1, l2, 'both') + + l_expected_add = [ + l1[0] + l2[0], + l1[0] + l2[1], + l1[1] + l2[0], + l1[1] + l2[1], + ] + + l_expected_mult = [ + l1[0] * l2[0], + l1[0] * l2[1], + l1[1] * l2[0], + l1[1] * l2[1], + ] + l_expected_both = [ + l1[0] + l2[0], + l1[0] + l2[1], + l1[1] + l2[0], + l1[1] + l2[1], + l1[0] * l2[0], + # l1[0] * l2[1], # This is a duplicate: linear*null = linear+null = linear + l1[1] * l2[0], + # l1[1] * l2[1], # This is a duplicate: constant*null = constant+null = constant + ] + logger_info('Result add:', l_result_add) + logger_info('Expected add:', l_expected_add) + self.assertListEqual(l_result_add, l_expected_add) + + logger_info('Result mult:', l_result_mult) + logger_info('Expected mult:', l_expected_mult) + self.assertListEqual(l_result_mult, l_expected_mult) + + logger_info('Result both:', l_result_both) + logger_info('Expected both:', l_expected_both) + self.assertListEqual(l_result_both, l_expected_both) + + def test_fit_model_trend_season_wday_mult(self): + # Test Specific model combination that doesn't fit + + # Setup + n_iterations = 10 + + # Setup + dict_df_y = { + # Single ts + 'df_1ts_nodate': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0)}), + # 2 ts + 'df_2ts_nodate': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'x': np.tile(np.arange(0, 100), 2), + }), + # 1 ts with datetime index + 'df_1ts_w': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W')), + + # 2 ts with datetime index + 'df_2ts_w': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='W'), 2)), + # Single ts, freq=D + 'df_1ts_d': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0)}, + index=pd.date_range('2014-01-01', periods=100, freq='D')), + + # Single ts, freq=D , index named 'date + 'df_1ts_d2': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0)}, + index=pd.date_range('2014-01-01', periods=100, freq='D', name='date')) + .reset_index() + , + + # 2 ts with datetime index, freq=D + 'df_2ts_d': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)])}, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq='D'), 2)) + } + + l_source_d = ['df_1ts_d', 'df_2ts_d','df_1ts_d2'] + l_source_w = ['df_1ts_2', 'df_2ts_2'] + + l_model_trend = [ + forecast_models.model_linear, + ] + l_model_season = [ + # forecast_models.model_season_wday, + # forecast_models.model_season_wday, + forecast_models.model_season_wday_2, + forecast_models.model_null + ] + + l_col_name_weight = [ # None, + 'weight'] + + l_results = [] 
+ l_optimize_info = [] + + # Fit , run n iterations, freq='D' + for (source, col_name_weight, model) in itertools.product( + l_source_d, l_col_name_weight, get_list_model(l_model_trend, l_model_season, 'both')): + df_y = dict_df_y[source].copy().pipe(normalize_df,col_name_weight=col_name_weight) + logger.info('Fitting src: %s , mod: %s, col_w: %s', source, model, col_name_weight) + for i in np.arange(0, n_iterations): + dict_fit_model = fit_model(model, df_y, source=source, freq='D') + l_results += [dict_fit_model['metadata']] + l_optimize_info += [dict_fit_model['optimize_info']] + + # Fit , run n iterations, freq='D' - test function composition in different order + for (source, col_name_weight, model) in itertools.product( + l_source_d, l_col_name_weight, get_list_model(l_model_season, l_model_trend, 'both')): + df_y = dict_df_y[source].copy().pipe(normalize_df, col_name_weight=col_name_weight) + logger.info('Fitting src: %s , mod: %s, col_w: %s', source, model, col_name_weight) + for i in np.arange(0, n_iterations): + dict_fit_model = fit_model(model, df_y, source=source, freq='D') + l_results += [dict_fit_model['metadata']] + l_optimize_info += [dict_fit_model['optimize_info']] + + df_result = pd.concat(l_results) + df_optimize_info = pd.concat(l_optimize_info) + logger_info('Result summary:', df_result) + logger_info('Optimize info summary:', df_optimize_info) + + def test_extrapolate_model(self): + # with freq=None, defaults to W + df_y_forecast = extrapolate_model(forecast_models.model_constant, [1.0], + '2017-01-01', '2017-01-01', freq=None, extrapolate_years=1.0) + logger_info('df_y_forecast', df_y_forecast.tail(1)) + logger_info('Result length:', df_y_forecast.index.size) + self.assertEquals(df_y_forecast.index.size, 53) + + df_y_forecast = extrapolate_model(forecast_models.model_constant, [1.0], + '2017-01-01', '2017-12-31', freq='D', extrapolate_years=1.0) + logger_info('df_y_forecast', df_y_forecast.tail(1)) + logger_info('Result length:', df_y_forecast.index.size) + self.assertEquals(df_y_forecast.index.size, 365 * 2) + + df_y_forecast = extrapolate_model(forecast_models.model_constant, [1.0], + '2017-01-01', '2017-12-31', freq='MS', extrapolate_years=1.0) + logger_info('df_y_forecast', df_y_forecast.tail(1)) + logger_info('Result length:', df_y_forecast.index.size) + self.assertEquals(df_y_forecast.index.size, 12 * 2) + + df_y_forecast = extrapolate_model(forecast_models.model_constant, [1.0], + '2000-01-01', '2009-01-01', freq='YS', extrapolate_years=10.0) + logger_info('df_y_forecast', df_y_forecast.tail(20)) + logger_info('Result length:', df_y_forecast.index.size) + self.assertEquals(df_y_forecast.index.size, 20) + + # TODO: Test other time frequencies, e.g. Q, H, Y. 
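+
+    # Illustrative sketch only (not part of the original test suite): the TODO above could be
+    # covered along these lines, assuming extrapolate_model accepts an hourly frequency string
+    # in the same way as the 'D' case exercised above. Prefixed with '_' so unittest does not
+    # collect it automatically; the loose assertion only checks that a forecast period is added.
+    def _sketch_extrapolate_model_hourly(self):
+        df_y_forecast = extrapolate_model(forecast_models.model_constant, [1.0],
+                                          '2017-01-01 00:00', '2017-01-02 00:00',
+                                          freq='H', extrapolate_years=1.0)
+        logger_info('Result length:', df_y_forecast.index.size)
+        # 25 hourly actuals plus a non-empty extrapolation period
+        self.assertTrue(df_y_forecast.index.size > 25)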
+ + def test_get_df_actuals_clean(self): + dict_df_y = { + # Single ts + 'df_1ts_nodate': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0)}), + # 2 ts + 'df_2ts_nodate': pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), + np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'x': np.tile(np.arange(0, 100), 2), + }), + # 1 ts with datetime index + 'df_1ts_w': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W')), + # 1 ts with datetime index named 'date + 'df_1ts_w-2': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0) + }, + index=pd.date_range('2014-01-01', periods=100, freq='W', name='date')), + # 1 ts with datetime column + 'df_1ts_w-3': pd.DataFrame({'y': np.full(100, 0.0), + 'weight': np.full(100, 1.0), + 'date': pd.date_range('2014-01-01', periods=100, freq='W') + }) + } + # Simple test - check for crashes + + for k in dict_df_y.keys(): + logger.info('Input: %s', k) + df_in = dict_df_y.get(k).pipe(normalize_df) + logger_info('DF_IN',df_in.tail(3)) + df_result = get_df_actuals_clean(df_in,'test','test') + logger_info('Result:', df_result.tail(3)) + unique_models = df_result.model.drop_duplicates().reset_index(drop=True) + self.assert_series_equal(unique_models, pd.Series(['actuals'])) + logger_info('Models:', df_result.model.drop_duplicates()) + + def _test_run_forecast_basic_tests_new_api(self, n_sources=1, **kwargs): + # Both additive and multiplicative + dict_result = run_forecast(simplify_output=False, **kwargs) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + l_sources = df_metadata.source_long.unique() + include_all_fits = kwargs.get('include_all_fits') + if not include_all_fits: # In this case, there should be only one model fitted per data source + self.assertTrue(df_metadata.is_best_fit.all()) + self.assertTrue((df_data.is_best_fit | df_data.is_actuals).all()) + # The following may not be true if a model doesn't converge + self.assertEquals(df_metadata.index.size, n_sources) + self.assertEquals(df_data.loc[~df_data.is_actuals].drop_duplicates('source_long').index.size, + n_sources) + + # Check that actuals are included + self.assertTrue((df_data.is_actuals.any())) + + # Check that dtype is not corrupted + self.assertTrue(np.issubdtype(df_data.y.astype(float), np.float64)) + + def _test_run_forecast_check_length_new_api(self, **kwargs): + freq = kwargs.get('freq', 'D') + + freq = detect_freq(kwargs.get('df_y').pipe(normalize_df)) + + freq_short = freq[0:1] if freq is not None else None # Changes e.g. 
W-MON to W + freq_units_per_year = 52.0 if freq_short == 'W' else 365.0 # Todo: change to dict to support more frequencies + + extrapolate_years = kwargs.get('extrapolate_years', 1.0) + + # Both additive and multiplicative + dict_result = run_forecast(simplify_output=False, extrapolate_years=extrapolate_years, **kwargs) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + df_data_size = df_data.groupby(['source', 'model', 'is_actuals']).size().rename('group_size').reset_index() + df_data_size_unique = ( + df_data.drop_duplicates(['source', 'model', 'is_actuals', 'date']) + .groupby(['source', 'model', 'is_actuals']).size().rename('group_size').reset_index() + ) + logger_info('df_data_size:', df_data_size) + logger_info('df_data_size_unique:', df_data_size_unique) + + df_y = kwargs.get('df_y') + assert df_y is not None + + # Normalize df_y + df_y = normalize_df(df_y, + kwargs.get('col_name_y', 'y'), + kwargs.get('col_name_weight', 'weight'), + kwargs.get('col_name_x', 'x'), + kwargs.get('col_name_date', 'date'), + kwargs.get('col_name_source', 'source')) + if 'source' not in df_y.columns: + df_y['source'] = kwargs.get('source_id', 'source') + + l_sources = df_y.source.drop_duplicates() + + for source in l_sources: + df_y_tmp = df_y.loc[df_y.source == source] + + size_actuals_unique_tmp = df_y_tmp.drop_duplicates('x').index.size + size_actuals_tmp = df_y_tmp.index.size + + df_data_size_tmp = df_data_size.loc[df_data_size.source == source] + df_data_size_actuals = df_data_size_tmp.loc[df_data_size_tmp.is_actuals] + df_data_size_fcast = df_data_size_tmp.loc[~df_data_size_tmp.is_actuals] + + # logger.info('DEBUG: group size: %s',100 + extrapolate_years*freq_units_per_year) + # This assert doesn't work for all years - some have 365 days, some 366. Currently running with 365-day year + + logger.info('DEBUG: df_data_size_fcast.group_size %s , size_actuals_tmp %s, total %s', + df_data_size_fcast.group_size.values, size_actuals_tmp, + size_actuals_tmp + extrapolate_years * freq_units_per_year) + + self.assertTrue((df_data_size_actuals.group_size == size_actuals_tmp).all()) + + self.assert_array_equal(df_data_size_fcast.group_size, + size_actuals_unique_tmp + extrapolate_years * freq_units_per_year) + self.assertFalse(df_data_size_fcast.empty) + + def _test_run_forecast(self, freq='D'): + # freq_short = freq[0:1] # Changes e.g. 
W-MON to W + # freq_units_per_year = 52.0 if freq_short == 'W' else 365.0 # Todo: change to dict to support more frequencies + + # Input dataframe without date column + df_y0 = pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + ) + + df_y1 = pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq=freq), 2)) + + # Too few samples + n = 4 + df_y1b = pd.DataFrame({'y': np.full(n, 0.0)}, + index=pd.date_range('2017-01-01', periods=n, freq=freq)) + + df_y2 = pd.DataFrame({'y': np.full(100, 0.0)}, + index=pd.date_range('2017-01-01', periods=100, freq=freq)) + + # Df with source column + df_y3 = pd.DataFrame({'y': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'source': ['src1'] * 100 + ['src2'] * 100 + }, + index=np.tile(pd.date_range('2014-01-01', periods=100, freq=freq), 2)) + # As above, with renamed columns + df_y3b = pd.DataFrame({'y_test': np.concatenate([np.full(100, 0.0), np.round(np.arange(-0.5, 0.5, 0.01), 2)]), + 'weight_test': np.concatenate([np.full(100, 0.1), np.full(100, 1.0)]), + 'source_test': ['src1'] * 100 + ['src2'] * 100, + 'date_test': np.tile(pd.date_range('2014-01-01', periods=100, freq=freq), 2) + }) + + # # Model lists + l_model_trend1 = [forecast_models.model_linear] + l_model_trend1b = [forecast_models.model_linear, forecast_models.model_season_wday_2] + l_model_trend2 = [forecast_models.model_linear, forecast_models.model_exp] + + l_model_season1 = [forecast_models.model_season_wday_2] + l_model_season2 = [forecast_models.model_season_wday_2, forecast_models.model_null] + # + # # # Test input with source column, multiple sources + # self._test_run_forecast_basic_tests_new_api(df_y=df_y3, include_all_fits=True, + # l_model_trend=l_model_trend2, l_model_season=l_model_season2) + # self._test_run_forecast_basic_tests_new_api(df_y=df_y3b, include_all_fits=True, + # l_model_trend=l_model_trend2, l_model_season=l_model_season2, + # col_name_y='y_test', col_name_date='date_test', + # col_name_source='source_test', col_name_weight='weight_test') + + ## New test - forecast length + logger.info('Testing Output Length') + self._test_run_forecast_check_length_new_api(df_y=df_y1, include_all_fits=False, + l_model_trend=l_model_trend1b, source_id='source1') + self._test_run_forecast_check_length_new_api(df_y=df_y2, include_all_fits=False, + l_model_trend=l_model_trend2, l_model_season=l_model_season2, + source_id='source2') + + def test_runforecast(self): + for freq in ['D', + 'W']: + self._test_run_forecast(freq=freq) + + def test_run_forecast_simple_linear_model(self): + df1 = pd.DataFrame({'y': np.arange(0, 10.)}, + index=pd.date_range('2014-01-01', periods=10, freq='D')) + dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_linear]) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(30)) + + df2 = pd.DataFrame({'y': np.arange(0, 10.), 'source': ['src1'] * 5 + ['src2'] * 5}, + 
index=pd.date_range('2014-01-01', periods=10, freq='D')) + dict_result = run_forecast(simplify_output=False, df_y=df2, l_model_trend=[forecast_models.model_linear]) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + + def test_run_forecast_naive(self): + # # Test 1 - linear series, 1 source + # df1 = pd.DataFrame({'y': np.arange(0,10.)}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend = [forecast_models.model_naive], + # extrapolate_years=10./365) + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(40)) + # + # # Test 2 - 2 sources + # df2 = pd.DataFrame({'y': np.arange(0,10.),'source' : ['src1']*5 + ['src2']*5}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df2, l_model_trend = [forecast_models.model_naive], + # extrapolate_years=10./365) + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # # test 3: weight column + # df1 = pd.DataFrame({'y': np.arange(0, 10.), 'weight': array_zeros_in_indices(10,[5,6])}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_naive], + # extrapolate_years=10. / 365) + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # a_y_result = df_data.loc[df_data.model=='naive'].y.values + # logger_info('a_y_result:', a_y_result) + # self.assert_array_equal(a_y_result, + # np.concatenate([ + # np.array([0., 0., 1., 2., 3., 4., 4.,4.,7.,8., 9.,]), + # np.full(9, 9.) + # ] + # )) + # + # df_forecast = dict_result['forecast'] + # logger_info('df_forecast',df_forecast) + # + # # Test 3b: weight column, season_add_mult = 'both' + # + # df1 = pd.DataFrame({'y': np.arange(0, 10.), 'weight': array_zeros_in_indices(10, [5, 6])}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_naive], + # extrapolate_years=10. 
/ 365, + # season_add_mult='both') + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # a_y_result = df_data.loc[df_data.model == 'naive'].y.values + # logger_info('a_y_result:', a_y_result) + # self.assert_array_equal(a_y_result, + # np.concatenate([ + # np.array([0., 0., 1., 2., 3., 4., 4., 4., 7., 8., 9., ]), + # np.full(9, 9.) + # ] + # )) + # + # df_forecast = dict_result['forecast'] + # logger_info('df_forecast', df_forecast) + # + # # Test 4: find_outliers + # + # df1 = pd.DataFrame({'y': np.arange(0, 10.)+10*array_ones_in_indices(10,[5,6])}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_naive], + # extrapolate_years=10. / 365, find_outliers=True) + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # a_y_result = df_data.loc[df_data.model=='naive'].y.values + # logger_info('a_y_result:', a_y_result) + # self.assert_array_equal(a_y_result, + # np.concatenate([ + # np.array([0., 0., 1., 2., 3., 4., 4.,4.,7.,8., 9.,]), + # np.full(9, 9.) + # ] + # )) + # + # df_forecast = dict_result['forecast'] + # logger_info('df_forecast',df_forecast) + # + # # Test 4b: find_outliers, season_add_mult = 'both' + # + # df1 = pd.DataFrame({'y': np.arange(0, 10.)+10*array_ones_in_indices(10,[5,6])}, + # index=pd.date_range('2014-01-01', periods=10, freq='D')) + # dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_naive], + # extrapolate_years=10. / 365, find_outliers=True, season_add_mult='both') + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # a_y_result = df_data.loc[df_data.model=='naive'].y.values + # logger_info('a_y_result:', a_y_result) + # self.assert_array_equal(a_y_result, + # np.concatenate([ + # np.array([0., 0., 1., 2., 3., 4., 4.,4.,7.,8., 9.,]), + # np.full(9, 9.) + # ] + # )) + # + # df_forecast = dict_result['forecast'] + # logger_info('df_forecast',df_forecast) + + # Test 5: Series with gap + + # df1 = ( + # pd.DataFrame({'y': np.arange(0, 10.), + # #'weight': array_zeros_in_indices(10, [5, 6]), + # 'date': pd.date_range('2014-01-01', periods=10, freq='D')}, + # ) + # + # ) + # + # df1 = pd.concat([df1.head(5), df1.tail(3)], sort=False, ignore_index=False).pipe(normalize_df) + # + # dict_result = run_forecast(simplify_output=False, df_y=df1, + # l_model_trend=[], + # l_model_naive=[forecast_models.model_naive, forecast_models.model_snaive_wday], + # extrapolate_years=10. 
/ 365, + # season_add_mult='both') + # + # + # df_data = dict_result['data'] + # df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + # + # logger_info('df_metadata:', df_metadata) + # logger_info('df_optimize_info:', df_optimize_info) + # logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + # + # a_y_result = df_data.loc[df_data.model == 'naive'].y.values + # logger_info('a_y_result:', a_y_result) + # self.assert_array_equal(a_y_result, + # np.concatenate([ + # np.array([0., 0., 1., 2., 3., 4., 4., 4., 7., 8., 9., ]), + # np.full(9, 9.) + # ] + # )) + # + # df_forecast = dict_result['forecast'] + # logger_info('df_forecast', df_forecast) + + # Test 6: Series with spike, find_outliers=True, use model_snaive_wday + + df1 = ( + pd.DataFrame({'y': np.arange(0, 21.) + 10*array_ones_in_indices(21, 7), + #'weight': array_zeros_in_indices(10, [5, 6]), + 'date': pd.date_range('2014-01-01', periods=21, freq='D')}, + ) + + ) + + #array_ones_in_indices(n, l_indices) + + dict_result = run_forecast(simplify_output=False, df_y=df1, + l_model_trend=[], + l_model_season=[], + l_model_naive=[forecast_models.model_snaive_wday], + extrapolate_years=20. / 365, + season_add_mult='both', find_outliers=True) + + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + df_data['wday']=df_data.date.dt.weekday + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + + a_y_result = df_data.loc[df_data.model == 'snaive_wday'].y.values + logger_info('a_y_result:', a_y_result) + self.assert_array_equal(a_y_result, + np.array([0., 1., 2., 3., 4., 5., 6., 17., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 14., 15., 16., 17., 18., + 19., 20., 14., 15., 16., 17., 18., 19.]) + ) + + df_forecast = dict_result['forecast'] + logger_info('df_forecast', df_forecast) + + def test_run_forecast_naive2(self): + # Test 1: run forecast with naive model, find_outliers, season_add_mult = 'add', weekly samples + path_df_naive = os.path.join(base_folder, 'df_test_naive.csv') + df_test_naive = pd.read_csv(path_df_naive) + + l_season_yearly = [ + forecast_models.model_season_month, + # model_season_fourier_yearly, + forecast_models.model_null] + + l_season_weekly = [ # forecast_models.model_season_wday_2, + forecast_models.model_season_wday, forecast_models.model_null] + + dict_result = run_forecast(simplify_output=False, df_y=df_test_naive, + #l_model_trend=[forecast_models.model_naive], + l_model_naive=[forecast_models.model_naive], + l_season_yearly=l_season_yearly, + l_season_weekly=l_season_weekly, + extrapolate_years=75. 
/ 365, find_outliers=True, season_add_mult='add') + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.loc[(df_data.date>'2017-12-01') & (df_data.date<'2018-02-01')]) + + a_y_result = df_data.loc[df_data.model == 'naive'].y.values + #logger_info('a_y_result:', a_y_result) + + df_forecast = dict_result['forecast'] + logger_info('df_forecast',df_forecast.loc[(df_forecast.date>'2017-12-01')& (df_forecast.date<'2018-02-01')]) + + # After first spike, naive forecast and actuals start matching, only if season_add_mult='both' + self.assertNotEqual(df_data.loc[(df_data.date == '2018-01-07') & (df_data.model=='naive')].y.iloc[0], + df_data.loc[(df_data.date == '2018-01-07') & (df_data.model == 'actuals')].y.iloc[0]) + + + # Test 2: run forecast with naive model, find_outliers, season_add_mult = 'both', weekly samples + #path_df_naive = os.path.join(base_folder, 'df_test_naive.csv') + #df_test_naive = pd.read_csv(path_df_naive) + + l_season_yearly = [ + forecast_models.model_season_month, + # model_season_fourier_yearly, + forecast_models.model_null] + + l_season_weekly = [ # forecast_models.model_season_wday_2, + forecast_models.model_season_wday, forecast_models.model_null] + + dict_result = run_forecast(simplify_output=False, df_y=df_test_naive, + #l_model_trend=[forecast_models.model_naive], + l_model_naive=[forecast_models.model_naive], + l_season_yearly=l_season_yearly, + l_season_weekly=l_season_weekly, + extrapolate_years=75. / 365, find_outliers=True, season_add_mult='both') + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.loc[(df_data.date>'2017-12-01') & (df_data.date<'2018-02-01')]) + + a_y_result = df_data.loc[df_data.model == 'naive'].y.values + #logger_info('a_y_result:', a_y_result) + + df_forecast = dict_result['forecast'] + logger_info('df_forecast',df_forecast.loc[(df_forecast.date>'2017-12-01')& (df_forecast.date<'2018-02-01')]) + + # After first spike, naive forecast and actuals start matching, only if season_add_mult='both' + self.assertNotEqual(df_data.loc[(df_data.date == '2018-01-07') & (df_data.model=='naive')].y.iloc[0], + df_data.loc[(df_data.date == '2018-01-07') & (df_data.model == 'actuals')].y.iloc[0]) + + + # Test 3 - multiple model_naive runs + path_df_naive = os.path.join(base_folder, 'df_test_naive.csv') + df_test_naive = pd.read_csv(path_df_naive) + + model_naive2 = forecast_models.ForecastModel('naive2', 0, forecast_models._f_model_naive) + + l_model_naive = [forecast_models.model_naive,model_naive2] + + dict_result = run_forecast(simplify_output=False, df_y=df_test_naive, + l_model_trend=[], + l_season_yearly=l_season_yearly, + l_season_weekly=l_season_weekly, + l_model_naive= l_model_naive, + extrapolate_years=75. 
/ 365, find_outliers=True, season_add_mult='add', ) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.loc[(df_data.date>'2017-12-01') & (df_data.date<'2018-02-01')]) + + a_y_result = df_data.loc[df_data.model == 'naive'].y.values + #logger_info('a_y_result:', a_y_result) + + df_forecast = dict_result['forecast'] + logger_info('df_forecast',df_forecast.loc[(df_forecast.date>'2017-12-01')& (df_forecast.date<'2018-02-01')]) + + # After first spike, naive forecast and actuals start matching, only if season_add_mult='both' + self.assertNotEqual(df_data.loc[(df_data.date == '2018-01-07') & (df_data.model=='naive')].y.iloc[0], + df_data.loc[(df_data.date == '2018-01-07') & (df_data.model == 'actuals')].y.iloc[0]) + + def test_run_forecast_sparse_with_gaps(self): + df_test = pd.DataFrame({'date': pd.to_datetime(['2018-08-01', '2018-08-09']), 'y': [1., 2.]}) + df_out = run_forecast(df_test, extrapolate_years=1.0) + logger_info('df_out', df_out) + def test_run_forecast_output_options(self): + freq = 'D' + freq_short = freq[0:1] # Changes e.g. W-MON to W + freq_units_per_year = 52.0 if freq_short == 'W' else 365.0 # Todo: change to dict to support more frequencies + + df_y = pd.DataFrame({'y': np.full(100, 0.0)}, + index=pd.date_range('2014-01-01', periods=100, freq=freq)) + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_linear, forecast_models.model_constant], + l_model_season=None, df_y=df_y, date_start_actuals=None + ) + + logger.info('Testing run forecast - default settings') + + dict_result = run_l_forecast([conf1]) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + for include_all_fits in [False, True]: + logger.info('Testing run forecast - include_all_fits=%s', + include_all_fits) + + dict_result = run_l_forecast([conf1], + include_all_fits=include_all_fits) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + # TODO: ADD ASSERTS + + def test_run_forecast_step(self): + # Setup + freq = 'D' + df_y1 = pd.DataFrame({'y': 5 * [10.0] + 5 * [20.0]}, + index=pd.date_range('2014-01-01', periods=10, freq=freq)) + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_constant, + forecast_models.model_constant + forecast_models.model_step], + l_model_season=None, df_y=df_y1, weights_y_values=1.0, date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + # Test 2 : 2 steps + + # Setup + freq = 'D' + df_y1 = pd.DataFrame({'y': [1., 1., 1., 1., 1., 1., 5., 5., 6., 6.]}, + 
index=pd.date_range('2014-01-01', periods=10, freq=freq)) + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_constant + forecast_models.model_two_steps], + l_model_season=None, df_y=df_y1, weights_y_values=1.0, date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + def test_run_forecast_sigmoid_step(self): + # Setup + freq = 'D' + df_y1 = pd.DataFrame({'y': [10., 10.1, 10.2, 10.3, 10.4, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6]}, + index=pd.date_range('2014-01-01', periods=12, freq=freq)) + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_constant, + forecast_models.model_sigmoid_step, + forecast_models.model_constant + forecast_models.model_sigmoid_step, + forecast_models.model_linear + forecast_models.model_sigmoid_step, + forecast_models.model_linear * forecast_models.model_sigmoid_step], + l_model_season=None, df_y=df_y1, weights_y_values=1.0, date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + # Same with negative step + df_y1 = pd.DataFrame({'y': [20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 10., 10.1, 10.2, 10.3, 10.4]}, + index=pd.date_range('2014-01-01', periods=12, freq=freq)) + + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[forecast_models.model_constant, + forecast_models.model_sigmoid_step, + forecast_models.model_constant + forecast_models.model_sigmoid_step, + forecast_models.model_linear + forecast_models.model_sigmoid_step, + forecast_models.model_linear * forecast_models.model_sigmoid_step], + l_model_season=None, df_y=df_y1, weights_y_values=1.0, date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + def test_run_forecast_fourier_yearly(self): + # Yearly sinusoidal function + + # With daily samples + length = 2 * 365 + # size will be +-10 +- uniform error + a_date = pd.date_range(start='2018-01-01', freq='D', periods=length) + a_y = (10 + np.random.uniform(low=0, high=1, size=length) + + 10 * (np.sin(np.linspace(-4 * np.pi, 4 * np.pi, length)))) + df_y = pd.DataFrame({'y': a_y}, index=a_date) + + conf = ForecastInput( + source_id='source', + l_model_trend=[ + forecast_models.model_constant, + forecast_models.model_season_fourier_yearly, + forecast_models.model_constant + + forecast_models.model_season_fourier_yearly], + l_model_season=[forecast_models.model_null], df_y=df_y, weights_y_values=1.0, date_start_actuals=None + ) + dict_result = run_l_forecast([conf], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = 
dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + df = df_data.loc[(df_data.model == 'a') | df_data.is_best_fit, + ['y', 'date', 'model']] + + df = df.pivot(values='y', columns='model', index='date') + if platform.system() != 'Darwin': # matplotlib tests don't work on mac # matplotlib tests don't work on mac + df.plot() + + length = 1 * 365 + # size will be +-10 +- uniform error + a_date = pd.date_range(start='2018-01-01', freq='D', periods=length) + a_y = (10 + np.random.uniform(low=0, high=1, size=length) + + 10 * (np.sin(np.linspace(-4 * np.pi, 4 * np.pi, length))) + + 5 * (np.cos(np.linspace(-6 * np.pi, 6 * np.pi, length)))) + df_y = pd.DataFrame({'y': a_y}, index=a_date) + + conf = ForecastInput( + source_id='source', + l_model_trend=[ + forecast_models.model_constant, + forecast_models.model_season_fourier_yearly, + forecast_models.model_constant + + forecast_models.model_season_fourier_yearly], + l_model_season=[forecast_models.model_null], df_y=df_y, weights_y_values=1.0, date_start_actuals=None + ) + dict_result = run_l_forecast([conf], include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + df = df_data.loc[(df_data.model == 'a') | df_data.is_best_fit, + ['y', 'date', 'model']] + + df = df.pivot(values='y', columns='model', index='date') + if platform.system() != 'Darwin': # matplotlib tests don't work on mac # matplotlib tests don't work on mac + df.plot() + # TODO find a better assertion test + pass + + def test_run_forecast_sigmoid(self): + # Input parameters + b_in = 100. + c_in = 40. + d_in = 1. 
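        # b_in, c_in and d_in parameterise the test sigmoid defined below,
        # y = a + (b - a) / (1 + np.exp(-d * (x - c))): b is the upper asymptote,
        # c the midpoint and d the steepness; the lower level a (a_in) is chosen
        # per branch depending on is_mult.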
+ # linear params + a_lin = 0.01 + b_lin = 0.05 + + is_mult_l = [False, True] + + def sigmoid(x, a, b, c, d): + y = a + (b - a) / (1 + np.exp(- d * (x - c))) + return y + + a_x = np.arange(1, 100) + # linear to find + + for is_mult in is_mult_l: + + if is_mult: + a_in = 1 + model = forecast_models.model_linear * forecast_models.model_sigmoid + y_lin = a_lin * a_x + b_lin + y_in = sigmoid(a_x, a_in, b_in, c_in, d_in) * y_lin + input_params = [a_lin, b_lin] + y_rand = np.random.uniform(low=0.001, high=0.1 * b_in, size=len(a_x)) * y_lin + else: + a_in = 30 # the constant + model = forecast_models.model_constant + forecast_models.model_sigmoid + y_in = sigmoid(a_x, a_in, b_in, c_in, d_in) + input_params = [a_in] + y_rand = np.random.uniform(low=0.001, high=0.1 * b_in, size=len(a_x)) + + input_params = input_params + [b_in - a_in, c_in, d_in] + + y_in = y_rand + y_in + df_y = pd.DataFrame({'y': y_in}, index=a_x) + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=[ + forecast_models.model_constant, + # forecast_models.model_sigmoid, + model, + # forecast_models.model_linear + forecast_models.model_sigmoid, + # forecast_models.model_linear * forecast_models.model_sigmoid + ], + l_model_season=None, df_y=df_y, weights_y_values=1.0, date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], + include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + # df_optimize_info = dict_result['optimize_info'] + + df = df_data.loc[:, ['y', 'date', 'model']] + + df = df.pivot(values='y', columns='model', index='date') + if platform.system() != 'Darwin': # matplotlib tests don't work on mac # matplotlib tests don't work on mac + df.plot() + output_params = df_metadata.loc[df_metadata.is_best_fit, 'params_str'] + logger.info('Input parameters: %s, Output parameters: %s', + input_params, output_params.iloc[0]) + pass # to see the plot + + def test_auto_find_sigmoid_step(self): + # Setup + + # First do it manually + freq = 'D' + a_y = [19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 10.7, 10.8, 10.9] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df_y = pd.DataFrame({'y': a_y}, index=a_date) + a_x = np.arange(0, len(a_y)) + + steps, spikes = forecast_models.find_steps_and_spikes(a_x, a_y, a_date) + assert len(steps) == 1 + assert len(spikes) == 0 + step_model = steps[0] + trend_models = [forecast_models.model_linear + step_model, + forecast_models.model_linear + forecast_models.model_sigmoid_step, + forecast_models.model_linear] + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=trend_models, + l_model_season=None, + df_y=df_y, + weights_y_values=1.0, + date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], + include_all_fits=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + # Then do it automatically + trend_models = [forecast_models.model_linear] + + # SolverConfig with trend + conf2 = ForecastInput( + source_id='source1', + l_model_trend=trend_models, + l_model_season=None, + df_y=df_y, + weights_y_values=1.0, + date_start_actuals=None + ) + + dict_result = run_l_forecast([conf2], + include_all_fits=True, do_find_steps_and_spikes=True) + 
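        # Only model_linear is supplied here; with do_find_steps_and_spikes=True the
        # solver is expected to detect the step itself (mirroring the manual
        # find_steps_and_spikes call above) rather than relying on a hand-built model list.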
df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + # Two changes + a_y = np.concatenate((np.arange(-1, 31), [50], np.arange(51, 70), [0], np.arange(1, 30)), + axis=0) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df_y = pd.DataFrame({'y': a_y}, index=a_date) + trend_models = [forecast_models.model_linear] + + # SolverConfig with trend + conf3 = ForecastInput( + source_id='source1', + l_model_trend=trend_models, + l_model_season=None, + df_y=df_y, + weights_y_values=1.0, + date_start_actuals=None + ) + + dict_result = run_l_forecast([conf3], + include_all_fits=True, do_find_steps_and_spikes=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + def test_auto_find_sigmoid_spike(self): + # Setup + + # First do it manually + freq = 'D' + a_y = np.concatenate((np.arange(-1, 30), [50, 51], np.arange(31, 50)), + axis=0) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df_y = pd.DataFrame({'y': a_y}, index=a_date).pipe(normalize_df) + a_x = np.arange(0, len(a_y)) + + steps, spikes = forecast_models.find_steps_and_spikes(a_x, a_y, a_date) + assert len(steps) == 0 + assert len(spikes) == 1 + spike_model = spikes[0] + trend_models = [forecast_models.model_linear * spike_model, + forecast_models.model_linear] + + # SolverConfig with trend + conf1 = ForecastInput( + source_id='source1', + l_model_trend=trend_models, + l_model_season=None, + df_y=df_y, + weights_y_values=1.0, + date_start_actuals=None + ) + + dict_result = run_l_forecast([conf1], + include_all_fits=True, do_find_steps_and_spikes=False) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + # Same automatically + trend_models = [forecast_models.model_linear] + # SolverConfig with trend + conf3 = ForecastInput( + source_id='source1', + l_model_trend=trend_models, + l_model_season=None, + df_y=df_y, + weights_y_values=1.0, + date_start_actuals=None + ) + + dict_result = run_l_forecast([conf3], + include_all_fits=True, + do_find_steps_and_spikes=True) + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(1)) + + def test_run_forecast_get_outliers(self): + + # Test 1 - no outliers + a_y = [20.0, 20.1, 20.2, 20.3, 20.4, 20.5] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y}) + + dict_result = run_forecast(df, find_outliers=True, simplify_output=False, include_all_fits=True, + season_add_mult='add') + logger_info('Metadata', dict_result['metadata']) + logger_info('data', dict_result['data'].tail(3)) + + # Check that dtype of y is not corrupted by None values from weight mask - this happens 
when no spikes found + self.assertTrue(np.issubdtype(dict_result['data'].y, np.float64)) + + # Test 2 - Single step + a_y = [19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 10.7, 10.8, 10.9] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y}) + + dict_result = run_forecast(df, find_outliers=True, simplify_output=False, include_all_fits=True, + season_add_mult='add') + logger_info('Metadata', dict_result['metadata']) + logger_info('data', dict_result['data'].tail(3)) + + # Check that dtype of y is not corrupted by None values from weight mask - this happens when no spikes found + self.assertTrue(np.issubdtype(dict_result['data'].y, np.float64)) + + # Test 3 - Single spike + + a_y = [19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 20.7, 20.8, 20.9, 21.0, + 21.1, 21.2, 21.3, 21.4, 21.5] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df_spike = pd.DataFrame({'y': a_y}) + + dict_result = run_forecast(df_spike, find_outliers=True, + simplify_output=False, include_all_fits=True, + season_add_mult='add') + df_data = dict_result['data'] + mask = df_data.loc[df_data.model == 'actuals'].weight + self.assert_array_equal(mask, [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ]) + + # Test 5 - 2 spikes and 1 step + a_y = [19.8, 19.9, 30.0, 30.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 30.7, 10.8, 10.9] + + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + dict_result = run_forecast(df, find_outliers=True, simplify_output=False, include_all_fits=True, + season_add_mult='add') + logger_info('Metadata', dict_result['metadata']) + df_result = dict_result['data'] + logger_info('data', df_result.tail(3)) + mask = df_result.loc[df_result.model=='actuals'].weight + self.assert_array_equal(mask, [1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]) + + def test_run_forecast_auto_season(self): + # Yearly sinusoidal function + + # With daily samples + length = 2 * 365 + # size will be +-10 +- uniform error + a_date = pd.date_range(start='2018-01-01', freq='D', periods=length) + a_y = (10 + np.random.uniform(low=0, high=1, size=length) + + 10 * (np.sin(np.linspace(-4 * np.pi, 4 * np.pi, length)))) + df_y = pd.DataFrame({'y': a_y}, index=a_date) + + dict_result = run_forecast(df_y, season_add_mult='add', simplify_output=False, include_all_fits=True, + l_model_trend=[forecast_models.model_linear]) + df_metadata = dict_result['metadata'] + + l_model_expected = ['linear', '(linear+(season_wday+season_fourier_yearly))', + '(linear+season_wday)', '(linear+season_fourier_yearly)'] + + self.assert_array_equal(df_metadata.model, l_model_expected) + logger_info('df_metadata:', df_metadata) + + # As above, with additive and multiplicative seasonality + + dict_result = run_forecast(df_y, season_add_mult='both', simplify_output=False, include_all_fits=True, + l_model_trend=[forecast_models.model_linear]) + df_metadata = dict_result['metadata'] + + l_model_expected = [ + '(linear*(season_wday*season_fourier_yearly))', + '(linear*season_fourier_yearly)', + '(linear*season_wday)', + '(linear+(season_wday+season_fourier_yearly))', + '(linear+season_fourier_yearly)', + '(linear+season_wday)', + 'linear' ] + + self.assert_array_equal(df_metadata.model.values, l_model_expected) + logger_info('df_metadata:', df_metadata) + + def test_run_forecast_with_weight(self): + df1 = pd.DataFrame({'y': np.arange(0, 10.), + 'date': 
pd.date_range('2014-01-01', periods=10, freq='D'), + 'weight': 1.}) + dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend = [forecast_models.model_linear], + extrapolate_years=10./365) + + df_forecast = dict_result['forecast'] + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_forecast:', df_forecast.groupby(['source', 'model']).tail(30)) + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(30)) + + df_forecast_filtered = df_forecast.loc[~df_forecast.is_actuals & (df_forecast.date>'2014-01-10')] + self.assert_series_equal(df_forecast_filtered.y, df_forecast_filtered.q5) + + df1b = df1.copy() + df1b.loc[0,'weight']=0. + + + + dict_result = run_forecast(simplify_output=False, df_y=df1b, l_model_trend = [forecast_models.model_linear], + extrapolate_years=10./365) + + df_forecast = dict_result['forecast'] + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_forecast:', df_forecast.groupby(['source', 'model']).tail(30)) + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(30)) + + len_forecast=df_data.loc[~df_data.is_actuals].index.size + self.assertEquals(len_forecast,19) # First sample shouldn't be included due to weight=0 + + # Since fit is perfect, prediction interval should be equal to point forecast + df_forecast_filtered = df_forecast.loc[~df_forecast.is_actuals & (df_forecast.date>'2014-01-10')] + self.assert_series_equal(df_forecast_filtered.y, df_forecast_filtered.q5) + + # Test with model_ramp + # Param A of model_ramp needs to be within the 15-85 percentile of valid x values + # Before a bugfix, we would get initial guesses of A=2, with boundaries (5.6, 8.4) + # Note: somehow validate bounds doesn't catch this! + + df1c = df1.copy() + df1c.loc[0:4, 'weight'] = 0. + + dict_result = run_forecast(simplify_output=False, df_y=df1c, l_model_trend=[forecast_models.model_ramp], + extrapolate_years=10. / 365) + + df_forecast = dict_result['forecast'] + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_forecast:', df_forecast.groupby(['source', 'model']).tail(30)) + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(30)) + + len_forecast=df_data.loc[~df_data.is_actuals].index.size + self.assertEquals(len_forecast,15) # First 5 samples shouldn't be included due to weight=0 + + # # Since fit is perfect, prediction interval should be equal to point forecast + # df_forecast_filtered = df_forecast.loc[~df_forecast.is_actuals & (df_forecast.date>'2014-01-10')] + # self.assert_series_equal(df_forecast_filtered.y, df_forecast_filtered.q5) + + + def test_detect_freq(self): + + # Initial test - what happens with single sample input? 
+ a_date = pd.a_date = pd.date_range('2014-01-01', periods=1, freq='H') + result = detect_freq(a_date) + #self.assertEquals(result, 'H') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=24*7, freq='H') + result = detect_freq(a_date) + self.assertEquals(result, 'H') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 365, freq='D') + result = detect_freq(a_date) + self.assertEquals(result, 'D') + + l_freq_wday = ['W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT', 'W-SUN'] + for freq_wday in l_freq_wday: + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 52, freq=freq_wday) + result = detect_freq(a_date) + self.assertEquals(result, freq_wday) + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='M') + result = detect_freq(a_date) + self.assertEquals(result, 'M') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='MS') + result = detect_freq(a_date) + self.assertEquals(result, 'MS') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='Q') + result = detect_freq(a_date) + self.assertEquals(result, 'Q') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='Y') + result = detect_freq(a_date) + self.assertEquals(result, 'Y') + + # Test with input dataframe + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=24 * 7, freq='H') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'H') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 365, freq='D') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'D') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='M') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'M') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='Q') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'Q') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='Y') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'Y') + + a_date = pd.a_date = pd.date_range('2014-01-01', periods=4 * 12, freq='YS') + df_y = pd.DataFrame({'date': a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'YS') + + # Test with sparse input series + a_date = pd.to_datetime(['2018-08-01', '2018-08-09']) + df_y = pd.DataFrame({'date':a_date}) + result = detect_freq(df_y) + self.assertEquals(result, 'D') + + # TODO: ADD TEST WITH NULL VALUES, E.G. MODEL_NAIVE_WDAY + def test_get_pi(self): + + def check_result(df_result): + self.assertTrue('q5' in df_result.columns) + df_result_actuals = df_result.loc[df_result.is_actuals] + if 'is_weight' in df_result_actuals.columns: + df_result_actuals = df_result_actuals.loc[~df_result_actuals.is_weight] + date_max_actuals = df_result_actuals.date.max() + logger_info('debug: date max actuals', date_max_actuals) + + df_result_forecast = df_result.loc[~df_result.is_actuals & (df_result.date > date_max_actuals)] + self.assertFalse(df_result_forecast.q5.isnull().any()) + + # First test with single source + # then test applied function on df grouped by source + + a_date_actuals = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals = np.arange(0, 10.) 
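        # Build 10 weekly actuals plus a 20-point best-fit forecast starting on the
        # same date; get_pi should then add prediction-interval columns (e.g. q5) and,
        # per check_result above, leave no null quantiles beyond the last actuals date.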
+ df_actuals = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date = pd.date_range('2014-01-01', periods=20, freq='W') + a_y = np.arange(0, 20.) + (np.tile([-1, 1], (10)) * np.arange(2, 0., -0.1)) + + df_fcast = ( + pd.DataFrame({'date': a_date, 'y': a_y, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df1 = pd.concat([df_actuals, df_fcast], ignore_index=True, sort=False) + + df_result = get_pi(df1, n=100) + df_result0 = df_result + # logger_info('df_result1:', df_result1) + logger_info('df_result1:', df_result.groupby(['source', 'model']).head(1)) + logger_info('df_result1:', df_result.groupby(['source', 'model']).tail(1)) + # TODO: Add checks + check_result(df_result) + + # Test 1b - input dataframe without is_best_fit column, source column + df1c = df1[['date', 'is_actuals', 'model', 'y']] + df_result = get_pi(df1c, n=100) + # logger_info('df_result1:', df_result1) + logger_info('df_result1:', df_result.groupby(['model']).head(1)) + logger_info('df_result1:', df_result.groupby(['model']).tail(1)) + + check_result(df_result) + + # Test 2 - 2 sources + + df1b = df1.copy() + df1b.source = 's2' + df2 = pd.concat([df1, df1b], sort=False) + + # df_result2 = df2.groupby('source').apply(get_pi, n=100).reset_index(drop=True) + df_result = get_pi(df2, n=100) + # logger_info('df_result2:', df_result2) + logger_info('df_result2:', df_result.groupby(['source', 'model']).head(1)) + logger_info('df_result2:', df_result.groupby(['source', 'model']).tail(1)) + # TODO: Add checks + + check_result(df_result) + + # Test 3 - Input has actuals but no forecast - can happen if fit not possible + + df3 = df_actuals + df_result = get_pi(df3, n=100) + self.assertIsNotNone(df3) + self.assertFalse('q5' in df_result.columns) + # logger_info('df_result1:', df_result1) + logger_info('df_result3:', df_result.groupby(['source', 'model']).head(1)) + logger_info('df_result3:', df_result.groupby(['source', 'model']).tail(1)) + # + # Test 4 - Input has null values at the end + a_date_actuals = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals = np.arange(0, 10.) + df_actuals = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date = pd.date_range('2014-01-01', periods=20, freq='W') + a_y = np.arange(0, 20.) 
+ (np.tile([-1, 1], (10)) * np.arange(2, 0., -0.1)) + + df_fcast = ( + pd.DataFrame({'date': a_date, 'y': a_y, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df1 = pd.concat([df_actuals, df_fcast], ignore_index=True, sort=False) + df_result = get_pi(df1, n=100) + + a_date_actuals_withnull = pd.date_range('2014-01-01', periods=20, freq='W') + a_y_actuals_withnull = np.concatenate([np.arange(0, 10.), np.full(10, np.NaN)]) + df_actuals_withnull = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date_withnull = pd.date_range('2014-01-01', periods=20, freq='W') + + df1_withnull = pd.concat([df_actuals_withnull, df_fcast], ignore_index=True, sort=False) + df_result_withnull = get_pi(df1_withnull, n=100) + + logger_info('df_result:', df_result.groupby(['source', 'model']).tail(3)) + logger_info('df_result with null:', df_result_withnull.groupby(['source', 'model']).tail(3)) + # Prediction intervals are random, so we need to exclude them from comparison + self.assert_frame_equal(df_result[['date', 'source', 'is_actuals', 'model', 'y']], + df_result_withnull[['date', 'source', 'is_actuals', 'model', 'y']]) + + # Test 4b - Input with null values at the end, weight column + df_weight = ( + pd.DataFrame({'date': a_date, 'y': 1, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear', + 'is_weight': True}) + ) + df_weight_withnull = ( + pd.DataFrame({'date': a_date_withnull, 'y': 1, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear', + 'is_weight': True}) + ) + + df1['is_weight'] = False + df1_withnull['is_weight'] = False + + df1b = pd.concat([df1, df_weight], ignore_index=True, sort=False) + df1b_withnull = pd.concat([df1_withnull, df_weight_withnull], ignore_index=True, sort=False) + + df_result_b = get_pi(df1b, n=100) + df_result_b_withnull = get_pi(df1b_withnull, n=100) + + logger_info('df_result b :', df_result_b.groupby(['source', 'model']).tail(3)) + logger_info('df_result b with null:', df_result_b_withnull.groupby(['source', 'model']).tail(3)) + # Prediction intervals are random, so we need to exclude them from comparison + self.assert_frame_equal(df_result_b[['date', 'source', 'is_actuals', 'model', 'y']], + df_result_b_withnull[['date', 'source', 'is_actuals', 'model', 'y']]) + + check_result(df_result_b) + check_result(df_result_b_withnull) + + # Test 4C - Input has null values at the start of actuals series + a_date_actuals = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals = np.arange(0, 10.) + df_actuals = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date = pd.date_range('2014-01-01', periods=20, freq='W') + a_y = np.arange(0, 20.) 
+ (np.tile([-1, 1], (10)) * np.arange(2, 0., -0.1)) + + df_fcast = ( + pd.DataFrame({'date': a_date, 'y': a_y, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df1 = pd.concat([df_actuals, df_fcast], ignore_index=True, sort=False) + df_result = get_pi(df1, n=100) + + a_date_actuals_withnull = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals_withnull = np.concatenate([np.full(5, np.NaN),np.arange(0, 5.)]) + df_actuals_withnull = ( + pd.DataFrame({'date': a_date_actuals_withnull, 'y': a_y_actuals_withnull, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date_withnull = pd.date_range('2014-01-01', periods=10, freq='W') + + df1_withnull = pd.concat([df_actuals_withnull, df_fcast], ignore_index=True, sort=False) + df_result_withnull = get_pi(df1_withnull, n=100) + + logger_info('df_actuals_withnull:', df_actuals_withnull.groupby(['source', 'model']).head(20)) + logger_info('df_result:', df_result.groupby(['source', 'model']).tail(3)) + logger_info('df_result with null:', df_result_withnull.groupby(['source', 'model']).tail(100)) + # todo - add proper expected value, uncomment assert + # self.assert_frame_equal(df_result[['date', 'source', 'is_actuals', 'model', 'y']], + # df_result_withnull[['date', 'source', 'is_actuals', 'model', 'y']]) + + + # Test 4D - Input has null values at the start of actuals series + a_date_actuals = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals = np.arange(0, 10.) + df_actuals = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date = pd.date_range('2014-01-01', periods=20, freq='W') + a_y = np.arange(0, 20.) + (np.tile([-1, 1], (10)) * np.arange(2, 0., -0.1)) + a_y_withnull = np.concatenate([np.full(5,np.NaN),np.arange(0,15.),]) + + df_fcast = ( + pd.DataFrame({'date': a_date, 'y': a_y, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df_fcast_withnull = ( + pd.DataFrame({'date': a_date, 'y': a_y_withnull, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df1 = pd.concat([df_actuals, df_fcast], ignore_index=True, sort=False) + df_result = get_pi(df1, n=100) + + + df1_withnull = pd.concat([df_actuals, df_fcast_withnull], ignore_index=True, sort=False) + df_result_withnull = get_pi(df1_withnull, n=100) + + logger_info('df_fcast_withnull:', df_fcast_withnull.groupby(['source', 'model']).head(20)) + logger_info('df_result:', df_result.groupby(['source', 'model']).tail(100)) + logger_info('df_result with null:', df_result_withnull.groupby(['source', 'model']).tail(100)) + # Prediction intervals are random, so we need to exclude them from comparison + # self.assert_frame_equal(df_result[['date', 'source', 'is_actuals', 'model', 'y']], + # df_result_withnull[['date', 'source', 'is_actuals', 'model', 'y']]) + # TODO: ADD VALID CHECK - + + + def test_get_pi_gap(self): + def check_result(df_result): + self.assertTrue('q5' in df_result.columns) + + # Test 1 - Input has gaps + + a_date_actuals = pd.date_range('2014-01-01', periods=10, freq='W') + a_y_actuals = np.arange(0, 10.) + df_actuals = ( + pd.DataFrame({'date': a_date_actuals, 'y': a_y_actuals, + 'source': 's1', 'is_actuals': True, 'is_best_fit': False, 'model': 'actuals'}) + ) + + a_date = pd.date_range('2014-01-01', periods=20, freq='W') + a_y = np.arange(0, 20.) 
+ (np.tile([-1, 1], (10)) * np.arange(2, 0., -0.1)) + + df_fcast = ( + pd.DataFrame({'date': a_date, 'y': a_y, + 'source': 's1', 'is_actuals': False, 'is_best_fit': True, 'model': 'linear'}) + ) + + df_actuals_gap = pd.concat([df_actuals.head(3), df_actuals.tail(3)]) + + df = pd.concat([df_actuals_gap, df_fcast], ignore_index=True, sort=False) + + df_result = get_pi(df, n=100) + # logger_info('df_result1:', df_result1) + logger_info('df_result1:', df_result.groupby(['source', 'model']).head(2)) + logger_info('df_result1:', df_result.groupby(['source', 'model']).tail(2)) + + check_result(df_result) + + # Test 2 - Input has nulls + + df_actuals_null = df_actuals.copy() + df_actuals_null.loc[5, 'y'] = np.NaN + + logger_info('df_actuals_null:', df_actuals_null) + + df = pd.concat([df_actuals_null, df_fcast], ignore_index=True, sort=False) + + df_result = get_pi(df, n=100) + # logger_info('df_result1:', df_result1) + logger_info('df_result2:', df_result.groupby(['source', 'model']).head(20)) + logger_info('df_result2:', df_result.groupby(['source', 'model']).tail(20)) + + self.assertFalse(df_result.loc[df_result.date > df_actuals.date.max()].q5.isnull().any()) + + check_result(df_result) + + def test_forecast_pi_missing(self): + path_candy = os.path.join(base_folder, 'candy_production.csv') + df_monthly_candy = pd.read_csv(path_candy) + dict_result = run_forecast(df_monthly_candy, + col_name_y='IPG3113N', + col_name_date='observation_date', extrapolate_years=2, + simplify_output=False) + + df_fcast = dict_result.get('forecast') + logger_info('df_fcast: ', df_fcast.tail()) + + self.assertIn('q5', df_fcast.columns) + + def test_run_forecast_yearly_model(self): + df1 = pd.DataFrame({'y': np.arange(0, 10.), 'date': pd.date_range('2000-01-01', periods=10, freq='YS')}) + dict_result = run_forecast(simplify_output=False, df_y=df1, l_model_trend=[forecast_models.model_linear], + extrapolate_years=10.) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(30)) + + # Repeat test - 2 sources + + df1a = df1.copy() + df1b = df1.copy() + df1a['source'] = 'src1' + df1b['source'] = 'src2' + df2 = pd.concat([df1a, df1b], sort=False, ignore_index=True) + + logger_info('df input:', df2) + + dict_result = run_forecast(simplify_output=False, df_y=df2, l_model_trend=[forecast_models.model_linear], + extrapolate_years=10.) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + + # Same, with simplify_output=True + + df_result = run_forecast(simplify_output=True, df_y=df2, l_model_trend=[forecast_models.model_linear], + extrapolate_years=10.) 
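        # With simplify_output=True, run_forecast appears to return a single combined
        # dataframe rather than the dict unpacked in the earlier tests, so the result
        # is logged directly below.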
+ logger_info('df_result:', df_result) diff --git a/tests/test_forecast_model.py b/tests/test_forecast_model.py new file mode 100644 index 0000000..f5dbe06 --- /dev/null +++ b/tests/test_forecast_model.py @@ -0,0 +1,900 @@ +""" + +Author: Pedro Capelastegui +Created on 04/12/2015 +""" + +import logging +import unittest +import itertools +import pandas as pd, numpy as np +from argparse import Namespace + +import numpy as np +import pandas as pd +from unittest import TestCase +from anticipy.utils_test import PandasTest +from anticipy.forecast_models import * +from anticipy.forecast import normalize_df +from anticipy.model_utils import interpolate_df + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def logger_info(msg, data): + logger.info(msg + '\n%s', data) + + +pd.set_option('display.max_columns', 40) +pd.set_option('display.max_rows', 200) +pd.set_option('display.width', 1000) + + +def get_initial_guess(f_model, t_values): + return f_model(t_values, None, None, get_aic_k=False) + +def array_ones_in_indices(n, l_indices): + return np.isin(np.arange(0,n),l_indices).astype(float) + +def array_zeros_in_indices(n, l_indices): + return (~np.isin(np.arange(0,n),l_indices)).astype(float) + +class TestForecastModel(PandasTest): + def setUp(self): + pass + + def test_model_naive(self): + a_x = np.arange(0, 10) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + a_y = 10*a_x + df_actuals = pd.DataFrame({'date':a_date,'x':a_x,'y':a_y}).head() + + a_y_result = model_naive(a_x, a_date, None, df_actuals=df_actuals) + logger_info('a_y result: ', a_y_result) + a_y_expected = np.array([0., 0., 10, 20., 30., 40, 40., 40., 40., 40.,]) + self.assert_array_equal(a_y_result, a_y_expected) + + # TODO: model composition disabled, check that exception is thrown + # # Model composition + # a_params = np.array([1.,0.,]) + # a_y_result = (model_naive + model_linear) (a_x, a_date, a_params, df_actuals=df_actuals) + # logger_info('a_y result: ', a_y_result) + + def test_model_snaive_wday(self): + a_x = np.arange(0, 21) + a_date = pd.date_range('2014-01-01', periods=21, freq='D') + a_y = 10.*a_x + df_actuals = pd.DataFrame({'date':a_date,'x':a_x,'y':a_y}).head(7) + + a_y_result = model_snaive_wday(a_x, a_date, None, df_actuals=df_actuals) + logger_info('a_y result: ', a_y_result) + a_y_expected = np.array([np.NaN]*7+np.arange(0,70., 10.).tolist()*2) + self.assert_array_equal(a_y_result, a_y_expected) + + # TODO: model composition disabled, check that exception is thrown + # # Model composition + # a_params = np.array([1.,0.,]) + # a_y_result = (model_naive + model_linear) (a_x, a_date, a_params, df_actuals=df_actuals) + # logger_info('a_y result: ', a_y_result) + + def test_forecast_model_simple_models(self): + # TODO: test all models with is_mult True and False + + a_x = np.arange(0, 10) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + + def test_model(name, model, params, a_expected, l_is_mult=None, a_date=a_date, a_x = a_x): + if l_is_mult is None: + l_is_mult = [True, False] + for is_mult in l_is_mult: + params = np.array(params) + a = model(a_x, a_date, params, is_mult) + logger_info('a {}, is_mult={} :'.format(name, is_mult), a) + self.assert_array_equal(a, a_expected) + # Test init params + params = model.f_init_params(None, None, None) + self.assertIsInstance(params, np.ndarray) + bounds = model.f_bounds(None, None, None) + logger.info('params: %s', params) + self.assertTrue(validate_initial_guess(params, bounds)) + params = model.f_init_params(a_x, 
None, a_x) + self.assertIsInstance(params, np.ndarray) + bounds = model.f_bounds(a_x, None, a_x) + logger.info('a_x: %s', a_x) + logger.info('params: %s', params) + logger.info('bounds: %s', bounds) + self.assertTrue(validate_initial_guess(params, bounds)) + params = model.f_init_params(None, a_x, a_date) + self.assertIsInstance(params, np.ndarray) + bounds = model.f_bounds(None, a_x, a_x) + logger.info('a_x: %s', a_x) + logger.info('params: %s', params) + logger.info('bounds: %s', bounds) + self.assertTrue(validate_initial_guess(params, bounds)) + params = model.f_init_params(a_x, a_x, a_date) + self.assertIsInstance(params, np.ndarray) + bounds = model.f_bounds(a_x, a_x, a_x) + logger.info('params: %s', params) + self.assertTrue(validate_initial_guess(params, bounds)) + + test_model('constant', model_constant, [42], + np.full(10, 42.)) + + test_model('linear', model_linear, [-1., 10], + np.arange(10., 0, -1)) + + test_model('ramp', model_ramp, [5., 1.], + np.concatenate([np.full(5, 0.), np.arange(0, 5.)]), [False]) + + test_model('ramp', model_ramp, [5., 1.], + np.concatenate([np.full(5, 1.), np.arange(1, 6.)]), [True]) + + test_model('exp', model_exp, [10., 2], + np.array([10., 20., 40., 80., 160., 320., 640., 1280., 2560., 5120.])) + + test_model('step', model_step, [5., 100.], + np.array(5 * [0.] + 5 * [100.]), [False]) + + test_model('step', model_step, [5., 100.], + np.array(5 * [1.] + 5 * [100.]), [True]) + + test_model('step_date', get_model_step_date('2014-01-06'), [100.], + np.array(5 * [0.] + 5 * [100.]), [False]) + + test_model('step_date', get_model_step_date('2014-01-06'), [100.], + np.array(5 * [1.] + 5 * [100.]), [True]) + + test_model('spike', model_spike, [10., 4., 6.], + np.array(4 * [0.] + 2 * [10.] + 4 * [0.]), [False]) + + test_model('spike', model_spike, [10., 4., 6.], + np.array(4 * [1.] + 2 * [10.] + 4 * [1.]), [True]) + + test_model('spike_date', get_model_spike_date('2014-01-05', '2014-01-07'), + [10.], + np.array(4 * [0.] + 2 * [10.] + 4 * [0.]), [False]) + + test_model('spike_date', get_model_spike_date('2014-01-05', '2014-01-07'), + [10.], + np.array(4 * [1.] + 2 * [10.] + 4 * [1.]), [True]) + + test_model('2 steps', model_two_steps, [5., 100., 7, 200.], + np.array(5 * [0.] + 2 * [100.] + 3 * [300.]), [False]) + + test_model('2 steps', model_two_steps, [5., 100., 7, 3.], + np.array(5 * [1.] + 2 * [100.] 
+ 3 * [300.]), [True]) + + test_model('season_wday', model_season_wday, 10 * np.arange(1., 7.), + np.array([20., 30., 40., 50., 60., 0, 10., 20., 30., 40.]), [False]) + + test_model('season_wday', model_season_wday, 10 * np.arange(1., 7.), + np.array([20., 30., 40., 50., 60., 1, 10., 20., 30., 40.]), [True]) + + + a_x2 = np.arange(0, 12) + a_date2 = pd.date_range('2014-01-01', periods=12, freq='D') + + + test_model('season_month', model_season_month, 10*np.arange(2.,13.), + np.array([60., 70., 80., 90., 100, 110., 120., 0., 20., 30.,40.,50., ]), [False], + a_date= pd.date_range('2014-06-01', periods=12, freq='M'), a_x=a_x2) + + test_model('season_month', model_season_month, 10*np.arange(2.,13.), + np.array([60., 70., 80., 90., 100, 110., 120., 1., 20., 30., 40.,50.,]), [True], + a_date= pd.date_range('2014-06-01', periods=12, freq='M'), a_x=a_x2) + + test_model('season_fourier_yearly', model_season_month, 10*np.arange(2.,13.), + np.array([60., 70., 80., 90., 100, 110., 120., 1., 20., 30.,40.,50., ]), [True], + a_date= pd.date_range('2014-06-01', periods=12, freq='M'), a_x=a_x2) + + # test fourier model + from anticipy.forecast_models import _f_init_params_fourier + + for is_mult in [False, True]: + a_x = 10 * np.arange(2., 13.) + a_date = pd.date_range('2014-06-01', periods=10, freq='M') + params = _f_init_params_fourier() + a = model_season_fourier_yearly(a_x, a_date, params, is_mult) + logger_info('a {}, is_mult={} :'.format('model_season_fourier_yearly', is_mult), a) + + for is_mult in [False, True]: + a_x = 10 * np.arange(2., 13.) + a_date = pd.date_range('2014-06-01', periods=10, freq='M') + params = np.full(20, 1.) + a = model_season_fourier_yearly(a_x, a_date, params, is_mult) + logger_info('a {}, is_mult={} :'.format('model_season_fourier_yearly', is_mult), a) + + def test_forecast_model_composite(self): + a_x = np.arange(1, 11.) + a_y = np.arange(1, 11.) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + a_date_month = pd.date_range('2014-01-01', periods=10, freq='M') + + dict_model = { + 'constant': model_constant, + 'linear': model_linear, + 'ramp': model_ramp, + 'exp': model_exp, + 'season_wday': model_season_wday, + # TODO: ADD season_wday_2 + 'season_month': model_season_month, + 'step': model_step, + 'two_steps': model_two_steps, + } + dict_params = { + 'constant': np.array([1.]), + 'linear': np.array([1., 0.]), + 'ramp': np.array([6., 1.]), + 'exp': np.array([1., 2.]), + 'season_wday': np.arange(1., 7.), + 'season_month': np.arange(2., 13.), + 'step': np.array([6., 100.]), + 'two_steps': np.array([6., 100., 8, 200.]), + } + dict_expected_add = { + 'constant': np.full(10, 1.), + 'linear': np.arange(1., 11.), + 'ramp': np.concatenate([np.full(5, 0.), np.arange(0, 5.)]), + 'exp': 2 ** np.arange(1., 11.), + 'season_wday': np.arange(2., 12., ) % 7, + 'season_month': np.full(10, 0.), + 'step': np.array(5 * [0.] + 5 * [100.]), + 'two_steps': np.array(5 * [0.] + 2 * [100.] + 3 * [300.]), + } + dict_expected_mult = { + 'constant': np.full(10, 1.), + 'linear': np.arange(1., 11.), + 'ramp': np.concatenate([np.full(5, 1.), np.arange(1, 6.)]), + 'exp': 2 ** np.arange(1., 11.), + 'season_wday': np.array([2., 3., 4., 5., 6., 1., 1., 2., 3., 4., ]), + 'season_month': np.full(10, 1.), + 'step': np.array(5 * [1.] + 5 * [100.]), + 'two_steps': np.array(5 * [1.] + 2 * [100.] 
+ 3 * [20000.]), + } + + def test_model_1(key): + model = dict_model[key] + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model %s - name: %s', key, model.name) + self.assert_array_equal(model(a_x, a_date, dict_params[key]), dict_expected_add[key]) + logger.info('Initial guess: %s', model.f_init_params(a_x, a_y)) + self.assertEquals(len(initial_guess), model.n_params) + + for key in dict_model.keys(): + test_model_1(key) + + def test_model_2_add(key1, key2): + model = dict_model[key1] + dict_model[key2] + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model %s, %s - name: %s', key1, key2, model.name) + logger.info('Parameters: %s , %s', dict_params[key1], dict_params[key2]) + logger.info('Initial guess: %s', initial_guess) + self.assertEquals(len(initial_guess), model.n_params) + model_output = model(a_x, a_date, + np.concatenate([dict_params[key1], dict_params[key2]])) + logger.info('Model output: %s', model_output) + self.assert_array_equal(model_output, + dict_expected_add[key1] + dict_expected_add[key2]) + + for key1, key2 in itertools.product(dict_model.keys(), dict_model.keys()): + logger.info('Keys: %s , %s', key1, key2) + test_model_2_add(key1, key2) + + def test_model_2_mult(key1, key2): + model = dict_model[key1] * dict_model[key2] + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model %s, %s - name: %s', key1, key2, model.name) + logger.info('Parameters: %s , %s', dict_params[key1], dict_params[key2]) + logger.info('Initial guess: %s', initial_guess) + self.assertEquals(len(initial_guess), model.n_params) + model_output = model(a_x, a_date, + np.concatenate([dict_params[key1], dict_params[key2]])) + logger.info('Model output: %s', model_output) + self.assert_array_equal(model_output, + dict_expected_mult[key1] * dict_expected_mult[key2]) + + for key1, key2 in itertools.product(dict_model.keys(), dict_model.keys()): + logger.info('Keys: %s , %s', key1, key2) + test_model_2_mult(key1, key2) + + def test_forecast_model_composite_null(self): + a_x = np.arange(0, 10.) + a_y = np.arange(0, 10.) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + a_date_month = pd.date_range('2014-01-01', periods=10, freq='M') + + dict_model = { + 'constant': model_constant, + 'linear': model_linear, + 'exp': model_exp, + 'season_wday': model_season_wday, + 'season_month': model_season_month, + } + + dict_params = { + 'constant': np.array([1.]), + 'linear': np.array([1., 0.]), + 'exp': np.array([1., 2.]), + 'season_wday': np.arange(1., 7.), + 'season_month': np.arange(1., 13.) 
+ } + dict_expected = { + 'constant': np.full(10, 1.), + 'linear': np.arange(0., 10.), + 'exp': 2 ** np.arange(0., 10.), + 'season_wday': np.arange(2., 12., ) % 7, + 'season_month': np.full(10, 0.), + } + + def test_model_2_add_null(key1): + model = dict_model[key1] + model_null + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model %s, - name: %s', key1, model.name) + logger.info('Parameters: %s', dict_params[key1]) + logger.info('Initial guess: %s', initial_guess) + self.assertEquals(len(initial_guess), model.n_params) + self.assert_array_equal(model(a_x, a_date, + dict_params[key1]), + dict_expected[key1]) + + for key in dict_model.keys(): + test_model_2_add_null(key) + + def test_model_2_mult_null(key1): + model_original = dict_model[key1] + model = model_original * model_null + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model %s, - name: %s', key1, model.name) + logger.info('Parameters: %s', dict_params[key1]) + logger.info('Initial guess: %s', initial_guess) + self.assertEquals(model, model_original) + + for key in dict_model.keys(): + test_model_2_mult_null(key) + + def test_forecast_model_composite_3(self): + # Test composition of 3+ models + a_x = np.arange(0, 10.) + a_y = np.arange(0, 10.) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + a_date_month = pd.date_range('2014-01-01', periods=10, freq='M') + + dict_model = { + 'constant': model_constant, + 'linear': model_linear, + 'ramp': model_ramp, + 'exp': model_exp, + 'season_wday': model_season_wday, + 'season_month': model_season_month, + } + + dict_params = { + 'constant': np.array([1.]), + 'linear': np.array([1., 0.]), + 'ramp': np.array([6., 1.]), + 'exp': np.array([1., 2.]), + 'season_wday': np.arange(1., 7.), + 'season_month': np.arange(1., 13.) 
+ } + dict_expected = { + 'constant': np.full(10, 1.), + 'linear': np.arange(0., 10.), + 'ramp': np.concatenate([np.full(5, 0.), np.arange(0, 5.)]), + 'exp': 2 ** np.arange(0., 10.), + 'season_wday': + # np.arange(2., 12., ) % 7, + np.array([2., 3., 4., 5., 6., 1., 1., 2., 3., 4.]), + 'season_month': np.full(10, 1.), + } + + def test_model_3(model, params, expected): + initial_guess = model.f_init_params(a_x, a_y) + logger.info('Testing model: %s', model.name) + logger.info('Parameters: %s', params) + logger.info('Initial guess: %s', initial_guess) + self.assertEquals(len(initial_guess), model.n_params) + self.assert_array_equal(model(a_x, a_date, params), + expected) + + test_model_3( + (model_linear * model_linear) + model_constant, + np.concatenate([dict_params['linear'], dict_params['linear'], dict_params['constant']]), + (dict_expected['linear'] * dict_expected['linear']) + dict_expected['constant'] + ) + + test_model_3( + model_linear * (model_linear + model_constant), + np.concatenate([dict_params['linear'], dict_params['linear'], dict_params['constant']]), + dict_expected['linear'] * (dict_expected['linear'] + dict_expected['constant']) + ) + + test_model_3( + (model_linear * model_season_wday) + model_constant, + np.concatenate([dict_params['linear'], dict_params['season_wday'], dict_params['constant']]), + (dict_expected['linear'] * dict_expected['season_wday']) + dict_expected['constant'] + ) + + def test_forecast_model_bounds(self): + + dict_model = { + 'constant': model_constant, + 'linear': model_linear, + 'exp': model_exp, + 'season_wday': model_season_wday, + 'season_month': model_season_month, + 'step': model_step, + 'two_steps': model_two_steps, + 'sigmoid_step': model_sigmoid_step, + 'ramp': model_ramp + } + dict_expected = dict() + for model_name, model_obj in dict_model.items(): + n_params = model_obj.n_params + exp = n_params * [-np.inf], n_params * [np.inf] + dict_expected[model_name] = exp + + # Manually set the boundaries here + dict_expected['sigmoid_step'] = ([-np.inf, -np.inf, 0.0], [np.inf, np.inf, np.inf]) + + def test_model_bounds(key, model, expected): + bounds = model.f_bounds() + params = model.n_params + logger.info('Testing model: %s', model.name) + logger.info('Bounds: %s', bounds) + logger.info('Expected: %s', expected) + self.assertEquals(params, len(bounds[0])) + self.assertTupleEqual(bounds, expected) + + for model_name, model_obj in dict_model.items(): + test_model_bounds(model_name, model_obj, dict_expected[model_name]) + + def test_get_model_outliers(self): + # TODO: change input dfs to normalized form, rather than call normalize_df + + # Test 1 - no outliers + a_y = [20.0, 20.1, 20.2, 20.3, 20.4, 20.5] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1:', mask_step) + self.assertIsNone(mask_step) + + # 1b - with datetime index + df = pd.DataFrame({'y': a_y}, index=a_date).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1b:', mask_step) + self.assertIsNone(mask_step) + + # Test 2 - Single step + a_y = np.array([19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 10.7, 10.8, 10.9]) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 2:', mask_step) + 
self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, 9)) + + # 2b - with date column + df = pd.DataFrame({'y': a_y}, index=a_date).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 2b:', mask_step) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, 9)) + + # Test 3 - Two step changes + a_y = np.array([-1, 0, 1, 2, 3, 5, 6, 8, 10, 15, 16, 18, + 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, + 10., 10.1, 10.2, 10.3, 10.4]) + + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 3:', mask_step) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, [9,18])) + + # Test 4 - Consecutive changes + a_y = np.array([-1, 0, 1, 2, 3, 5, 6, 8, 15, 16, 21, 20.1, + 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, + 10., 10.1, 10.2, 10.3, 10.4]) + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 4:', mask_step) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, [8,19])) + + + ## spikes + + # Test 5 - 2 spikes and 1 step + a_y = np.array([19.8, 19.9, 30.0, 30.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 30.7, 10.8, 10.9]) + + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 5:', mask_step) + logger_info('mask 5:',mask_spike) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, [9])) + self.assert_array_equal(mask_spike, + array_zeros_in_indices(a_y.size, [2,3,16])) + + # 5b - with datetime index + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y}, index=a_date).pipe(normalize_df) + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 5:', mask_step) + logger_info('mask 5:',mask_spike) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, [9])) + self.assert_array_equal(mask_spike, + array_zeros_in_indices(a_y.size, [2,3,16])) + + # Test 6 - single spike + a_y = np.array([19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 20.8, 20.9, 21.0, + 21.1, 21.2, 21.3, 21.4, 21.5, 21.6]) + + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 6:', mask_step) + logger_info('mask 6:',mask_spike) + self.assertEquals(str(mask_step), + 'None') + self.assert_array_equal(mask_spike, + array_zeros_in_indices(a_y.size, [9])) + + + # Test 6b - single spike co-located with step + a_y = np.array([19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 30.7, 30.8, 30.9, 31.0, + 31.1, 31.2, 31.3, 31.4, 31.5]) + + df = pd.DataFrame({'y': a_y}).pipe(normalize_df) + + mask_step,mask_spike = get_model_outliers(df) + logger_info('Model 6:', mask_step) + logger_info('mask 6:',mask_spike) + self.assert_array_equal(mask_step, + array_ones_in_indices(a_y.size, [9])) + self.assert_array_equal(mask_spike, + array_zeros_in_indices(a_y.size, [9])) + + + # TODO: Work in progress + def test_get_model_outliers_withgap(self): + + # # Test 1 - short series with null value - nulls cause no outliers + a_y = [0., 1., np.NaN, 3.,4.,5.,6.,7.,] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y, 'date': a_date}).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1:', mask_step) + self.assertIsNone(mask_step) + self.assertIsNone(mask_spike) + + # 
Test 1b - series with multiple values per x -- raises ValueError + a_y = np.arange(0,10.) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y, 'date': a_date}) + df = pd.concat([df.head(5), df.head(6).tail(2)]).pipe(normalize_df) + + with self.assertRaises(ValueError): + mask_step, mask_spike = get_model_outliers(df) + + # Test 2 - short series with gap value - no real outliers + a_y = np.arange(0,10.) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y, 'date': a_date}) + df = pd.concat([df.head(5), df.tail(-6)]).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1:', mask_step) + self.assertIsNotNone(mask_step) # Incorrectly finds a step + self.assertIsNone(mask_spike) # No spikes + + # Test 2b - after interpolating, can get outliers - finds none + + df_nogap = df.pipe(interpolate_df, include_mask=True) + mask_step, mask_spike = get_model_outliers(df_nogap) + logger_info('df 1 - no gap:', df_nogap) + self.assertIsNone(mask_step) # No steps + self.assertIsNone(mask_spike) # No spikes + + + # # Test 3 - short series with gap value - with outliers + a_y = np.arange(0,10.) + a_y2 = np.arange(1, 11.) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = pd.DataFrame({'y': a_y, 'date': a_date}) + df2 = pd.DataFrame({'y': a_y2, 'date': a_date}) + df = pd.concat([df.head(5), df2.tail(-6)]).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1:', mask_step) + self.assertIsNotNone(mask_step) # Incorrectly finds a step + self.assertIsNone(mask_spike) # No spikes + + # Test 3b - after interpolating with interpolate_df() - TODO: REMOVE THIS + + df_nogap = df.pipe(interpolate_df, include_mask=True) + mask_step, mask_spike = get_model_outliers(df_nogap) + + df_nogap ['mask_step']=mask_step + df_nogap['step_in_filled_gap'] = df_nogap.mask_step*df_nogap.is_gap_filled + + df_nogap['mask_step_patch'] = df_nogap.step_in_filled_gap.shift(-1).fillna(0) + df_nogap = df_nogap.loc[~df_nogap.is_gap_filled] + df_nogap['mask_step_patch'] = df_nogap.mask_step_patch.shift(1).fillna(0) + df_nogap['mask_step'] = df_nogap.mask_step+df_nogap.mask_step_patch + df_nogap = df_nogap[['date','x','y','mask_step']] + logger_info('df 1 - no gap:', df_nogap) + + self.assert_array_equal(df_nogap.mask_step, + array_ones_in_indices(df_nogap.index.size, [5])) + + self.assertIsNone(mask_spike) # No spikes + + # TODO: we need to + # - filter out filled gaps + # - get list of steps + # - if a step is in a filled gap, move to next sample + + # Test 3c - same, with function + + mask_step, mask_spike = get_model_outliers_withgap(df) + logger_info('Model 3c:', mask_step) + self.assert_array_equal(mask_step, + array_ones_in_indices(df_nogap.index.size, [5])) + logger_info('mask_spike:', mask_spike) + logger_info('mask_step:', mask_step) + + self.assertIsNone(mask_spike) # No spikes + + + # TODO: Work in progress + def test_get_model_outliers_adj_season(self): + + # # Test 1 - short series - no outlier + # a_y = np.arange(4) + # a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + # df = pd.DataFrame({'y': a_y, 'date': a_date}).pipe(normalize_df) + # + # model, mask = get_model_outliers(df) + # logger_info('Model 1:', model) + # self.assertIsNone(model) + + # Test 1 - short series - has outlier + a_y = np.array([0., 1., 1000., 3.,4.,5.]) + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + df = 
pd.DataFrame({'y': a_y, 'date': a_date}).pipe(normalize_df) + + mask_step, mask_spike = get_model_outliers(df) + logger_info('Model 1:', mask_step) + self.assertIsNone(mask_step) + self.assert_array_equal(mask_spike, + array_zeros_in_indices(a_y.size, [2])) + + + + + + def test_find_steps(self): + # No changes + a_y = [20.0, 20.1, 20.2, 20.3, 20.4, 20.5] + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, a_date=None) + assert len(steps) == 0 + assert len(spikes) == 0 + + # Single step + a_y = [19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 10., 10.1, 10.2, 10.3, 10.4, + 10.5, 10.6, 10.7, 10.8, 10.9] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, a_date=a_date) + assert steps + assert len(spikes) == 0 + logger.info('Steps found = %s', len(steps)) + assert len(steps) == 1 + + # Single spike + a_y = [19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, + 20.6, 30., 30.1, 20.8, 20.9, 21.0, 21.1, 21.2, + 21.3, 21.4, 21.5] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, a_date=a_date) + assert len(steps) == 0 + assert spikes + logger.info('Spikes found = %s', len(spikes)) + assert len(spikes) == 1 + + # Two changes + a_y = [-1, 0, 1, 2, 3, 5, 6, 8, 10, 15, 16, 18, + 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, + 10., 10.1, 10.2, 10.3, 10.4] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, a_date=a_date, window=3) + assert steps + assert len(spikes) == 0 + logger.info('Steps found = %s', len(steps)) + assert len(steps) == 2 + + # Consecutive changes + a_y = [-1, 0, 1, 2, 3, 5, 6, 8, 15, 16, 21, 20.1, + 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, + 10., 10.1, 10.2, 10.3, 10.4] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, window=3, a_date=a_date) + assert steps + assert len(spikes) == 0 + logger.info('Steps found = %s', len(steps)) + assert len(steps) == 2 + + # Select number of changes + a_y = [-1, 0, 1, 2, 3, 5, 6, 8, 15, 16, 21, 20.1, + 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, + 10., 10.1, 10.2, 10.3, 10.4] + a_date = pd.date_range(start='2018-01-01', periods=len(a_y), freq='D') + steps, spikes = find_steps_and_spikes(a_x=None, a_y=a_y, a_date=a_date, window=3, max_changes=1) + assert steps + assert len(spikes) == 0 + logger.info('Steps found = %s', len(steps)) + assert len(steps) == 1 + # d = changes[0] + # assert d['change_type'] == 'step' + # assert d['duration'] == 3 + # Difference would be 7 (increase from 8 -> 15) + 5 (increase from 16 -> 21) = 12 + # self.assertAlmostEqual(first=d['diff'], second=12.0) + + def test_fixed_model_creation(self): + a_x = np.arange(0, 10) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + + a1 = model_constant(a_x, a_date, np.array([42])) + model_constant_fixed = get_fixed_model(model_constant, np.array([42])) + print(model_constant_fixed) + a2 = model_constant_fixed(a_x, a_date, None) + self.assert_array_equal(a1, a2) + + def test_fix_params_fmodel(self): + a_x = np.arange(0, 10) + a_date = pd.date_range('2014-01-01', periods=10, freq='D') + + a1 = model_linear(a_x, a_date, np.array([10., -1.])) + model_linear_fixed = fix_params_fmodel(model_linear, [10., np.NaN]) + logger_info('model_linear_fixed:', model_linear_fixed) + self.assertEquals(model_linear_fixed.n_params, 1) + a2 = model_linear_fixed(a_x, a_date, 
params=[-1.]) + self.assert_array_equal(a1, a2) + + # TODO: Implement test + def test_validate_model_bounds(self): + pass + + def test_get_l_model_auto_season(self): + + # 0. Test for series with single sample + a_date = pd.a_date = pd.date_range('2014-01-01', periods=1, freq='D') + l_expected = [model_null] + l_result = get_l_model_auto_season(a_date) + self.assert_array_equal(l_result, l_expected) + + # 1. Tests for series with daily samples + + # Test 1.1 - not enough samples for weekly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=10, freq='D') + l_expected = [model_null] + l_result = get_l_model_auto_season(a_date) + self.assert_array_equal(l_result, l_expected) + + # Test 1.2 - enough samples for weekly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=12, freq='D') + l_expected = [model_null, model_season_wday] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # Test 1.3 - Weekly and yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=549, freq='D') + l_expected = [model_null, model_season_wday * model_season_fourier_yearly, + model_season_wday, model_season_fourier_yearly] + l_result = get_l_model_auto_season(a_date, min_periods=1.5, season_add_mult='mult') + self.assert_array_equal(l_result, l_expected) + + l_expected = [model_null, model_season_wday + model_season_fourier_yearly, model_season_wday, + model_season_fourier_yearly] + l_result = get_l_model_auto_season(a_date, min_periods=1.5, season_add_mult='add') + self.assert_array_equal(l_result, l_expected) + + + # 2. Tests for series with weekly samples + + # Test 2.2 - not enough samples for yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=12, freq='W') + l_expected = [model_null] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # Test 2.3 - Weekly and yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=80, freq='W') + l_expected = [model_null, model_season_fourier_yearly] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # 3. Tests for series with monthly samples + + # Test 3.2 - not enough samples for yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=12, freq='M') + l_expected = [model_null] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # Test 3.3 - Weekly and yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=20, freq='M') + l_expected = [model_null, model_season_fourier_yearly] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # 4. 
Tests for series with quarterly samples + + # Test 4.2 - not enough samples for yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=5, freq='Q') + l_expected = [model_null] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + # Test 4.3 - Weekly and yearly seasonality + a_date = pd.a_date = pd.date_range('2014-01-01', periods=7, freq='Q') + l_expected = [model_null, model_season_fourier_yearly] + l_result = get_l_model_auto_season(a_date, min_periods=1.5) + self.assert_array_equal(l_result, l_expected) + + def test_simplify_model(self): + # Test 1: normal bounds + model_dummy = Namespace() + model_dummy.f_bounds = lambda a_x, a_y, a_date: (np.array([3.]), np.array([7.])) + model_dummy.n_params = 1 + model_dummy.name = 'dummy' + + model_result = simplify_model(model_dummy) + logger_info('model_dummy', model_dummy) + logger_info('result:', model_result) + self.assertEquals(model_dummy, model_result) + + # Test 2: min and max bounds match - model transformed into fixed model + model_dummy = Namespace() + model_dummy.f_bounds = lambda a_x, a_y, a_date: (np.array([5.]), np.array([5.])) + model_dummy.n_params = 1 + model_dummy.name = 'dummy' + + model_result = simplify_model(model_dummy) + logger_info('model_dummy', model_dummy) + logger_info('result:', model_result) + self.assertEquals(model_result.n_params, 0) + + def test_validate_initial_guess(self): + result = validate_initial_guess(np.array([5., 5.]), + (np.array([0., 0.]), np.array([10., 10.]))) + self.assertTrue(result) + + result = validate_initial_guess(np.array([0., 10.]), + (np.array([0., 0.]), np.array([10., 10.]))) + self.assertTrue(result) + + result = validate_initial_guess(np.array([-1., 11.]), + (np.array([0., 0.]), np.array([10., 10.]))) + self.assertFalse(result) diff --git a/tests/test_forecast_plot.py b/tests/test_forecast_plot.py new file mode 100644 index 0000000..63ea7b9 --- /dev/null +++ b/tests/test_forecast_plot.py @@ -0,0 +1,157 @@ +# -- Public Imports + +import logging +import unittest +from itertools import chain, repeat +import matplotlib.pyplot as plt +import numpy as np +import os +import itertools +import pandas as pd +from unittest import TestCase + +# -- Private Imports +from anticipy.utils_test import PandasTest +from anticipy import forecast_plot + +# -- Globals + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def logger_info(msg, data): + logger.info(msg + '\n%s', data) + + +base_folder = os.path.join(os.path.dirname(__file__), 'test_plots') + +def get_path_test_plot(name, is_matplotlib=False): + if is_matplotlib: + name = name+'_mpl' + file_name = '{}.png'.format(name) + return os.path.join(base_folder, file_name) + + +df_forecast = ( + pd.concat([ + pd.DataFrame({'date': pd.date_range('2018-01-01', periods=6, freq='D'), + 'model': 'actuals', + 'y': 1000 * np.arange(0., 6.), + 'is_actuals': True + }), + pd.DataFrame({'date': pd.date_range('2018-01-01', periods=10, freq='D'), + 'model': 'forecast', + 'y': 1000 * np.full(10, 5.), + 'is_actuals': False + }), + + ], sort=False, ignore_index=True) +) + +df_forecast_pi = ( + pd.concat([ + pd.DataFrame({'date': pd.date_range('2018-01-01', periods=6, freq='D'), + 'model': 'actuals', + 'y': 1000 * np.arange(0., 6.), + 'is_actuals': True + }), + pd.DataFrame({'date': pd.date_range('2018-01-01', periods=6, freq='D'), + 'model': 'forecast', + 'y': 1000 * np.full(6, 5.), + 'is_actuals': False + }), + pd.DataFrame({'date': 
pd.date_range('2018-01-07', periods=4, freq='D'), + 'model': 'forecast', + 'y': 1000 * np.full(4, 5.), + 'is_actuals': False, + 'q5': 1000 * np.full(4, 4.), + 'q20': 1000 * np.full(4, 4.5), + 'q80': 1000 * np.full(4, 5.5), + 'q95': 1000 * np.full(4, 6.), + }), + + ], sort=False, ignore_index=True) +) + +# Dataframe with different data sources, to plot with faceting +df_forecast_p1 = df_forecast.copy() +df_forecast_p2 = df_forecast.copy() +df_forecast_p1['source'] = 'ts1' +df_forecast_p2['source'] = 'ts2' +df_forecast_facet = pd.concat([df_forecast_p1, df_forecast_p2], sort=False, ignore_index=True) + +df_forecast_p3 = df_forecast.copy() +df_forecast_p4 = df_forecast.copy() +df_forecast_p5 = df_forecast.copy() +df_forecast_p3['source'] = 'ts3' +df_forecast_p4['source'] = 'ts4' +df_forecast_p5['source'] = 'ts5' +df_forecast_facet_5 = pd.concat([df_forecast_p1, + df_forecast_p2, + df_forecast_p3, + df_forecast_p4, + df_forecast_p5], sort=False, ignore_index=True) + + +# As above, with prediction interval +# Dataframe with different data sources, to plot with faceting +df_forecast_p1_pi = df_forecast_pi.copy() +df_forecast_p2_pi = df_forecast_pi.copy() +df_forecast_p1_pi['source'] = 'ts1' +df_forecast_p2_pi['source'] = 'ts2' +df_forecast_facet_pi = pd.concat([df_forecast_p1_pi, df_forecast_p2_pi], sort=False, ignore_index=True) + + + +class TestForecastPlot(PandasTest): + + def test_ggplot_fcast_save(self): + is_matplotlib=True + path = get_path_test_plot('test',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot') + logger_info('plot saved to :', path) + + path = get_path_test_plot('test_k',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot', scale='k') + logger_info('plot saved to :', path) + + path = get_path_test_plot('test_m',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot', scale='M') + logger_info('plot saved to :', path) + + # Todo: add checks about file creation, cleanup after running + + logger_info('debug - df_forecast_facet', df_forecast_facet) + + path = get_path_test_plot('test_facet',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast_facet, path, 400, 300, 'Test Plot') + logger_info('plot saved to :', path) + + ## Repeat test with prediction intervals + # TODO: ADD _PI TO PATH NAME + + path = get_path_test_plot('test',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot') + logger_info('plot saved to :', path) + + path = get_path_test_plot('test_k',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot', scale='k') + logger_info('plot saved to :', path) + + path = get_path_test_plot('test_m',is_matplotlib) + forecast_plot.plot_forecast_save(df_forecast, path, 400, 300, 'Test Plot', scale='M') + logger_info('plot saved to :', path) + + # Todo: add checks about file creation, cleanup after running + + logger_info('debug - df_forecast_facet', df_forecast_facet) + + path = get_path_test_plot('test_facet') + forecast_plot.plot_forecast_save(df_forecast_facet, path, 400, 300, 'Test Plot') + logger_info('plot saved to :', path) + + def test_plot_forecast(self): + i = forecast_plot.plot_forecast(df_forecast, 400, 300, 'Test Plot') + logger_info('plot output:', repr(i)) + # Todo: add checks to validate Ipython.Image instance diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py new file mode 100644 index 0000000..f5be388 --- /dev/null +++ b/tests/test_model_utils.py @@ -0,0 
+1,192 @@ +""" + +Author: Pedro Capelastegui +Created on 04/12/2015 +""" + +import logging +import unittest +from itertools import chain, repeat + +import numpy as np +import os +import itertools +import pandas as pd +from unittest import TestCase +# This line fixes import errors +from anticipy.utils_test import PandasTest +from anticipy.model_utils import * +from anticipy import forecast_models + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def logger_info(msg, data): + logger.info(msg + '\n%s', data) + + +# class TestModelUtils(TestCase): +class TestModelUtils(PandasTest): + def setUp(self): + pass + + def test_array_transpose(self): + a = np.arange(10) + self.assertEqual(a.shape, (10,)) + at = array_transpose(a) + self.assertEqual(at.shape, (10, 1)) + + def test_apply_a_x_scaling(self): + a_x = np.arange(10) + l_models = [ + # No model - default config + None, + # Model requires omega n + forecast_models.model_linear, + # Model requires scaling + forecast_models.model_sigmoid, + # aperiodic_models.get_model_logistic_4_param # Todo: Implement and test model function with a_x scaling + ] + for model in l_models: + # No asserts - just check that the function runs without crashing, and manually check results in logs + a_x = apply_a_x_scaling(a_x, model) + logger.info('f_model: %s', model) + logger_info('a_x', a_x) + + def test_get_a_x_date_extrapolate(self): + # TODO: TEST Output size, scenarios with different frequencies + l_df_y = [ + # Single ts + pd.Series(index=pd.date_range('2016-01-01', periods=10, freq='W'), + data=np.arange(10)), + # Multiple ts + pd.DataFrame(index=pd.date_range('2016-01-01', periods=10, freq='W'), + data={'a': np.arange(10), 'b': np.arange(10)}) + ] + l_models = [ + # No model - default config + None, + # Model requires omega n + forecast_models.model_linear, + # Model requires scaling + forecast_models.model_sigmoid, + # aperiodic_models.get_model_logistic_4_param # Todo: Implement and test model function with a_x scaling + ] + l_time_resolutions = [ + # Default config + 'W-SUN', + 'W', + 'W-MON', + 'D', + 'MS', + 'YS' + ] + # logger_info('list_ts',l_df_y) + for (df_type, model, time_resolution) in itertools.product(['single', 'multi'], l_models, l_time_resolutions): + dict_df = { + # Single ts + 'single': pd.Series(index=pd.date_range('2016-01-01', periods=10, freq=time_resolution), + data=np.arange(10)), + # Multiple ts + 'multi': pd.DataFrame(index=pd.date_range('2016-01-01', periods=10, freq=time_resolution), + data={'a': np.arange(10), 'b': np.arange(10)}) + } + ts = dict_df.get(df_type) + + # No asserts - just check that the function runs without crashing, and manually check results in logs + s_x = get_s_x_extrapolate(ts.index.min(), ts.index.max(), model=model, freq=time_resolution, + extrapolate_years=1.0) + logger.info('type of df: %s, f_model: %s , time_resolution: %s', df_type, model, time_resolution) + logger_info('s_x', s_x.tail(3)) + logger_info('a_dates', s_x.tail(3).index) + self.assertIsInstance(s_x.index, pd.DatetimeIndex) + self.assertLessEqual(s_x.index.max(), ts.index.max() + 1.1 * pd.Timedelta(1, 'Y')) + self.assertGreaterEqual(s_x.index.max(), ts.index.max() + 0.9 * pd.Timedelta(1, 'Y')) + + # Check that all actuals values are in extrapolated series + self.assertEquals(np.setdiff1d(ts.index, s_x.index).size, 0) + + ts = l_df_y[0] + model = l_models[0] + time_resolution = l_time_resolutions[0] + s_x = get_s_x_extrapolate(ts.index.min(), ts.index.max(), model=model, freq=time_resolution, + 
extrapolate_years=3.0) + logger.info('# of ts: %s, f_model: %s , time_resolution: %s', ts.shape, model, time_resolution) + logger_info('a_x', s_x.head(3)) + logger_info('a_x index', s_x.head(3).index) + self.assertIsInstance(s_x.index, pd.DatetimeIndex) + logger_info('t_values len', len(s_x)) + self.assertEquals(len(s_x), 10 + 3.0 * 52) + + # Test with freq='D' + l_df_y = [ + # Single ts + pd.Series(index=pd.date_range('2016-01-01', periods=10, freq='D'), + data=np.arange(10)), + # Multiple ts + pd.DataFrame(index=pd.date_range('2016-01-01', periods=10, freq='D'), + data={'a': np.arange(10), 'b': np.arange(10)}) + ] + l_models = [ + # No model - default config + None, + # Model requires omega n + forecast_models.model_linear, + # Model requires scaling + # aperiodic_models.get_model_logistic_4_param # Todo: Implement and test model function with a_x scaling + ] + l_time_resolutions = [ + # Default config + 'D' + ] + logger_info('list_ts', l_df_y) + for (ts, model, time_resolution) in itertools.product(l_df_y, l_models, l_time_resolutions): + # No asserts - just check that the function runs without crashing, and manually check results in logs + s_x = get_s_x_extrapolate(ts.index.min(), ts.index.max(), model=model, freq=time_resolution) + logger.info('# of ts: %s, f_model: %s , time_resolution: %s', ts.shape, model, time_resolution) + logger_info('s_x', s_x.tail(3)) + logger_info('a_dates', s_x.tail(3).index) + self.assertIsInstance(s_x.index, pd.DatetimeIndex) + + ts = l_df_y[0] + model = l_models[0] + time_resolution = l_time_resolutions[0] + s_x = get_s_x_extrapolate(ts.index.min(), ts.index.max(), model=model, freq=time_resolution, + extrapolate_years=3.0) + logger.info('# of ts: %s, f_model: %s , time_resolution: %s', ts.shape, model, time_resolution) + logger_info('t_values', s_x.tail(3)) + logger_info('t_values_index', s_x.index) + self.assertIsInstance(s_x.index, pd.DatetimeIndex) + logger_info('t_values len', len(s_x)) + self.assertEquals(len(s_x), 10 + 3.0 * 365) + + def test_get_aic_c(self): + + # Known error scenario: 0 error, 1 parameters - should return -inf + aic_c1 = get_aic_c(0, 10, 1) + logger_info('AIC_C:', aic_c1) + self.assertTrue(np.isneginf(aic_c1)) + + def print_aic_c(fit_error, n, n_params): + aic_c1 = get_aic_c(fit_error, n, n_params) + logger.info('AIC_C (%s, %s, %s): %s', fit_error, n, n_params, aic_c1) + + print_aic_c(0.1, 10, 1) + print_aic_c(0.1, 10, 2) + print_aic_c(0.1, 10, 3) + print_aic_c(0.001, 10, 1) + print_aic_c(0.001, 10, 2) + print_aic_c(0.001, 10, 3) + print_aic_c(0.1, 100, 1) + print_aic_c(0.1, 100, 2) + print_aic_c(0.1, 100, 3) + print_aic_c(0, 10, 1) + print_aic_c(0, 10, 2) + print_aic_c(0, 10, 3) + + def test_get_s_aic_c_best_result_key(self): + s_tmp = pd.DataFrame({'c1': [1], 'c2': [2], 'c3': [-np.inf]}).set_index(['c1', 'c2'])['c3'] + result1 = get_s_aic_c_best_result_key(s_tmp) + logger_info('DEBUG: ', result1) + self.assertTupleEqual(get_s_aic_c_best_result_key(s_tmp), (1, 2))
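For context on the checks in test_get_aic_c: the expected -inf for a zero fit error is consistent with the standard small-sample corrected Akaike Information Criterion. A minimal sketch follows, assuming get_aic_c(fit_error, n, n_params) treats fit_error as a residual sum of squares; that signature and interpretation are inferred only from the test calls, not from the anticipy implementation.

import numpy as np

def aic_c_sketch(fit_error, n, n_params):
    # Hypothetical illustration, not the anticipy implementation:
    # AIC = 2*k + n*ln(RSS/n), and AICc adds the small-sample correction
    # 2*k*(k + 1) / (n - k - 1). With fit_error == 0 the log term is -inf,
    # matching the -inf asserted in test_get_aic_c.
    k = float(n_params)
    with np.errstate(divide='ignore'):
        aic = 2. * k + n * np.log(float(fit_error) / n)
    return aic + (2. * k * (k + 1.)) / (n - k - 1.)

# e.g. aic_c_sketch(0, 10, 1) -> -inf, while aic_c_sketch(0.1, 10, 2) is finite.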