From 0dd4b1413ad6e44bff1582b0c8d6da0d29e13e77 Mon Sep 17 00:00:00 2001
From: "Kamil A. Kaczmarek"
Date: Tue, 5 Jun 2018 13:53:08 +0200
Subject: [PATCH] Dev solution 1 (#40)

* dev-Solution1 (#16)

* solution 1

* Add features

* random-search (#17)

* random-search

* Update neptune.yaml

* Solution 1 - upgrade (#18)

* Remove save_eval function

* Add more features

* Refactor code - steppy 0.1.1 (#22)

* Refactor code

* Include suggested changes

* Update models.py

* Verifying submission (#26)

* Refactor code

* Do not save submission if dev_mode

* Verifying submission

* Add USELESS_COLUMNS

* Include remaining features

* Fix random search

* added requirements

* optimized imports

* Fix solution-1 (#38)

* Refactor code

* Fix

* small refactor

* added neptune-cli to requirements

* added steppy-toolkit to requirements

* one line clip in postprocessing (#39)

* cleaning code for release of the solution-1

* cleaning code for the solution-1

* cleaning code for the solution-1 (again)
---
 .gitignore                 |   1 +
 feature_extraction.py      |  90 ++++++++++++++
 hyperparameter_tuning.py   | 155 +++++++++++++++++++++++
 main.py                    | 184 +++++++++++++++++++++++++++
 models.py                  |  31 +++++
 neptune.yaml               |  57 +++++++++
 neptune_random_search.yaml |  57 +++++++++
 pipeline_config.py         | 180 +++++++++++++++++++++++++++
 pipelines.py               | 248 +++++++++++++++++++++++++++++++++++++
 postprocessing.py          |  13 ++
 requirements.txt           |  11 ++
 utils.py                   | 103 +++++++++++++++
 12 files changed, 1130 insertions(+)
 create mode 100644 feature_extraction.py
 create mode 100644 hyperparameter_tuning.py
 create mode 100644 main.py
 create mode 100644 models.py
 create mode 100644 neptune.yaml
 create mode 100644 neptune_random_search.yaml
 create mode 100644 pipeline_config.py
 create mode 100644 pipelines.py
 create mode 100644 postprocessing.py
 create mode 100644 requirements.txt
 create mode 100644 utils.py

diff --git a/.gitignore b/.gitignore
index dbb7aae..96eb0db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ tests/.cache
 .cache/
 .idea/
 .idea_modules/
+*_local.yaml
 out/
 output
 output/
diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000..6d90588
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,90 @@
+import category_encoders as ce
+import numpy as np
+import pandas as pd
+from sklearn.externals import joblib
+from steppy.adapters import to_numpy_label_inputs
+from steppy.base import BaseTransformer
+from steppy.utils import get_logger
+
+logger = get_logger()
+
+
+class DataFrameByTypeSplitter(BaseTransformer):
+    def __init__(self, numerical_columns, categorical_columns, timestamp_columns):
+        self.numerical_columns = numerical_columns
+        self.categorical_columns = categorical_columns
+        self.timestamp_columns = timestamp_columns
+
+    def transform(self, X, y=None, **kwargs):
+        outputs = {}
+
+        if self.numerical_columns is not None:
+            outputs['numerical_features'] = X[self.numerical_columns]
+
+        if self.categorical_columns is not None:
+            outputs['categorical_features'] = X[self.categorical_columns]
+
+        if self.timestamp_columns is not None:
+            outputs['timestamp_features'] = X[self.timestamp_columns]
+
+        return outputs
+
+
+class FeatureJoiner(BaseTransformer):
+    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
+        features = numerical_feature_list + categorical_feature_list
+        for feature in features:
+            feature.reset_index(drop=True, inplace=True)
+        outputs = {}
+        outputs['features'] = pd.concat(features, axis=1).astype(np.float32)
+        outputs['feature_names'] = self._get_feature_names(features)
+        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
+        return outputs
+
+    def _get_feature_names(self, dataframes):
+        feature_names = []
+        for dataframe in dataframes:
+            try:
+                feature_names.extend(list(dataframe.columns))
+            except Exception as e:
+                print(e)
+                feature_names.append(dataframe.name)
+
+        return feature_names
+
+
+class TargetEncoder(BaseTransformer):
+    def __init__(self, **kwargs):
+        self.params = kwargs
+        self.encoder_class = ce.TargetEncoder
+
+    def fit(self, X, y, **kwargs):
+        categorical_columns = list(X.columns)
+        self.target_encoder = self.encoder_class(cols=categorical_columns, **self.params)
+        self.target_encoder.fit(X, y)
+        return self
+
+    def transform(self, X, y=None, **kwargs):
+        X_ = self.target_encoder.transform(X)
+        return {'categorical_features': X_}
+
+    def load(self, filepath):
+        self.target_encoder = joblib.load(filepath)
+        return self
+
+    def save(self, filepath):
+        joblib.dump(self.target_encoder, filepath)
+
+
+class ToNumpyLabel(BaseTransformer):
+    def __init__(self, **kwargs):
+        self.y = None
+
+    def fit(self, y, **kwargs):
+        self.y = to_numpy_label_inputs(y)
+        return self
+
+    def transform(self, **kwargs):
+        if self.y.any():
+            return {'y': self.y}
+        return {}
diff --git a/hyperparameter_tuning.py b/hyperparameter_tuning.py
new file mode 100644
index 0000000..4ac478c
--- /dev/null
+++ b/hyperparameter_tuning.py
@@ -0,0 +1,155 @@
+import gc
+
+import numpy as np
+from deepsense import neptune
+from sklearn.externals import joblib
+from steppy.base import BaseTransformer
+
+from utils import set_seed
+
+
+class RandomSearchOptimizer(BaseTransformer):
+    def __init__(self, TransformerClass, params,
+                 score_func, maximize,
+                 train_input_keys, valid_input_keys,
+                 n_runs,
+                 callbacks=[]):
+        self.TransformerClass = TransformerClass
+        self.param_space = create_param_space(params, n_runs)
+        self.train_input_keys = train_input_keys
+        self.valid_input_keys = valid_input_keys
+        self.score_func = score_func
+        self.maximize = maximize
+        self.callbacks = callbacks
+        self.best_transformer = TransformerClass(**self.param_space[0])
+
+    def fit(self, **kwargs):
+        if self.train_input_keys:
+            train_inputs = {input_key: kwargs[input_key] for input_key in self.train_input_keys}
+        else:
+            train_inputs = kwargs
+        X_valid, y_valid = kwargs[self.valid_input_keys[0]], kwargs[self.valid_input_keys[1]]
+
+        results = []
+        for i, param_set in enumerate(self.param_space):
+            try:
+                transformer = self.TransformerClass(**param_set)
+                transformer.fit(**train_inputs)
+            except Exception:
+                continue
+            y_pred_valid = transformer.transform(X_valid)
+            y_pred_valid_value = list(y_pred_valid.values())[0]
+            run_score = self.score_func(y_valid, y_pred_valid_value)
+            results.append((run_score, param_set))
+
+            del y_pred_valid, transformer
+            gc.collect()
+
+            for callback in self.callbacks:
+                callback.on_run_end(score=run_score, params=param_set)
+
+        assert len(results) > 0, 'All random search runs failed, check your parameter space'
+        results_sorted = sorted(results, key=lambda x: x[0])
+
+        if self.maximize:
+            best_score, best_param_set = results_sorted[-1]
+        else:
+            best_score, best_param_set = results_sorted[0]
+
+        for callback in self.callbacks:
+            callback.on_search_end(results=results)
+
+        self.best_transformer = self.TransformerClass(**best_param_set)
+        self.best_transformer.fit(**train_inputs)
+        return self
+
+    def transform(self, **kwargs):
+        return self.best_transformer.transform(**kwargs)
+
+    def save(self, filepath):
+        self.best_transformer.save(filepath)
+
+    def load(self, filepath):
+        self.best_transformer.load(filepath)
+        return self
+
+
+def create_param_space(params, n_runs):
+    seed = np.random.randint(1000)
+    param_space = []
+    for i in range(n_runs):
+        set_seed(seed + i)
+        param_choice = {}
+        for param, value in params.items():
+            if isinstance(value, list):
+                if len(value) == 2:
+                    mode = 'choice'
+                    param_choice[param] = sample_param_space(value, mode)
+                else:
+                    mode = value[-1]
+                    param_choice[param] = sample_param_space(value[:-1], mode)
+            else:
+                param_choice[param] = value
+        param_space.append(param_choice)
+    return param_space
+
+
+def sample_param_space(value_range, mode):
+    if mode == 'list':
+        value = np.random.choice(value_range)
+    else:
+        range_min, range_max = value_range
+        if mode == 'choice':
+            value = np.random.choice(range(range_min, range_max, 1))
+        elif mode == 'uniform':
+            value = np.random.uniform(low=range_min, high=range_max)
+        elif mode == 'log-uniform':
+            value = np.exp(np.random.uniform(low=np.log(range_min), high=np.log(range_max)))
+        else:
+            raise NotImplementedError
+    return value
+
+
+class GridSearchCallback:
+    def on_run_end(self, score, params):
+        return NotImplementedError
+
+    def on_search_end(self, results):
+        return NotImplementedError
+
+
+class NeptuneMonitor(GridSearchCallback):
+    def __init__(self, name):
+        self.name = name
+        self.ctx = neptune.Context()
+        self.highest_params_channel = self._create_text_channel(name='highest params')
+        self.lowest_params_channel = self._create_text_channel(name='lowest params')
+        self.run_params_channel = self._create_text_channel(name='run params')
+        self.run_id = 0
+
+    def on_run_end(self, score, params):
+        self.ctx.channel_send('score on run', x=self.run_id, y=score)
+        self.run_params_channel.send(y=params)
+        self.run_id += 1
+
+    def on_search_end(self, results):
+        results_sorted = sorted(results, key=lambda x: x[0])
+        highest_score, highest_param_set = results_sorted[-1]
+        lowest_score, lowest_param_set = results_sorted[0]
+
+        self.ctx.channel_send('highest score', x=0, y=highest_score)
+        self.ctx.channel_send('lowest score', x=0, y=lowest_score)
+
+        self.highest_params_channel.send(y=highest_param_set)
+        self.lowest_params_channel.send(y=lowest_param_set)
+
+    def _create_text_channel(self, name=''):
+        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)
+
+
+class SaveResults(GridSearchCallback):
+    def __init__(self, filepath):
+        self.filepath = filepath
+
+    def on_search_end(self, results):
+        joblib.dump(results, self.filepath)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..536e417
--- /dev/null
+++ b/main.py
@@ -0,0 +1,184 @@
+import os
+import shutil
+
+import click
+import pandas as pd
+from deepsense import neptune
+from sklearn.metrics import roc_auc_score
+
+import pipeline_config as cfg
+from pipelines import PIPELINES
+from utils import create_submission, init_logger, read_params, save_evaluation_predictions, \
+    set_seed, stratified_train_valid_split, verify_submission
+
+set_seed(1234)
+logger = init_logger()
+ctx = neptune.Context()
+params = read_params(ctx)
+
+
+@click.group()
+def action():
+    pass
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train(pipeline_name, dev_mode):
+    _train(pipeline_name, dev_mode)
+
+
+def _train(pipeline_name, dev_mode):
+    if bool(params.overwrite) and os.path.isdir(params.experiment_dir):
+        shutil.rmtree(params.experiment_dir)
+
+    logger.info('reading data in')
+    if dev_mode:
+        meta_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_train = pd.read_csv(params.train_filepath)
+
+    meta_train_split, meta_valid_split = stratified_train_valid_split(meta_train,
+                                                                      target_column=cfg.TARGET_COLUMNS,
+                                                                      target_bins=params.target_bins,
+                                                                      valid_size=params.validation_size,
+                                                                      random_state=1234)
+
+    logger.info('Target distribution in train: {}'.format(meta_train_split[cfg.TARGET_COLUMNS].mean()))
+    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))
+
+    logger.info('shuffling data')
+    meta_train_split = meta_train_split.sample(frac=1)
+    meta_valid_split = meta_valid_split.sample(frac=1)
+
+    data = {'input': {'X': meta_train_split.drop(cfg.TARGET_COLUMNS, axis=1),
+                      'y': meta_train_split[cfg.TARGET_COLUMNS],
+                      'X_valid': meta_valid_split.drop(cfg.TARGET_COLUMNS, axis=1),
+                      'y_valid': meta_valid_split[cfg.TARGET_COLUMNS],
+                      },
+            }
+
+    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    pipeline.fit_transform(data)
+    pipeline.clean_cache()
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def evaluate(pipeline_name, dev_mode):
+    _evaluate(pipeline_name, dev_mode)
+
+
+def _evaluate(pipeline_name, dev_mode):
+    logger.info('reading data in')
+    if dev_mode:
+        meta_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_train = pd.read_csv(params.train_filepath)
+
+    _, meta_valid_split = stratified_train_valid_split(meta_train,
+                                                       target_column=cfg.TARGET_COLUMNS,
+                                                       target_bins=params.target_bins,
+                                                       valid_size=params.validation_size,
+                                                       random_state=1234)
+
+    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))
+
+    data = {'input': {'X': meta_valid_split,
+                      'y': None,
+                      },
+            }
+    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    output = pipeline.transform(data)
+    pipeline.clean_cache()
+    y_pred = output['clipped_prediction']
+    y_true = meta_valid_split[cfg.TARGET_COLUMNS].values.reshape(-1)
+
+    logger.info('Saving evaluation predictions')
+    save_evaluation_predictions(params.experiment_dir, y_true, y_pred, meta_valid_split)
+
+    logger.info('Calculating ROC_AUC Full Scores')
+    score = roc_auc_score(y_true, y_pred)
+    logger.info('ROC_AUC score on validation is {}'.format(score))
+    ctx.channel_send('ROC_AUC', 0, score)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def predict(pipeline_name, dev_mode):
+    _predict(pipeline_name, dev_mode)
+
+
+def _predict(pipeline_name, dev_mode):
+    logger.info('reading data in')
+    if dev_mode:
+        meta_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_test = pd.read_csv(params.test_filepath)
+
+    data = {'input': {'X': meta_test,
+                      'y': None,
+                      },
+            }
+
+    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    output = pipeline.transform(data)
+    pipeline.clean_cache()
+    y_pred = output['clipped_prediction']
+
+    logger.info('creating submission...')
+    submission = create_submission(meta_test, y_pred)
+
+    logger.info('verifying submission')
+    sample_submission = pd.read_csv(params.sample_submission_filepath)
+    verify_submission(submission, sample_submission)
+
+    if dev_mode:
+        logger.info('submission can\'t be saved in dev mode')
+    else:
+        submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
+        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
+        logger.info('submission saved to {}'.format(submission_filepath))
+        logger.info('submission head \n\n{}'.format(submission.head()))
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate_predict(pipeline_name, dev_mode):
+    logger.info('TRAINING')
+    _train(pipeline_name, dev_mode)
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+    logger.info('PREDICTION')
+    _predict(pipeline_name, dev_mode)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def evaluate_predict(pipeline_name, dev_mode):
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+    logger.info('PREDICTION')
+    _predict(pipeline_name, dev_mode)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate(pipeline_name, dev_mode):
+    logger.info('TRAINING')
+    _train(pipeline_name, dev_mode)
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+
+
+if __name__ == "__main__":
+    action()
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..705ba45
--- /dev/null
+++ b/models.py
@@ -0,0 +1,31 @@
+import lightgbm as lgb
+import numpy as np
+from steppy.adapters import to_numpy_label_inputs
+from toolkit.misc import LightGBM
+
+
+class LightGBMLowMemory(LightGBM):
+    def fit(self, X, y, X_valid, y_valid, feature_names=None, categorical_features=None, **kwargs):
+        y = to_numpy_label_inputs([y])
+        y_valid = to_numpy_label_inputs([y_valid])
+
+        X = X[feature_names].values.astype(np.float32)
+        y = y.astype(np.float32)
+
+        X_valid = X_valid[feature_names].values.astype(np.float32)
+        y_valid = y_valid.astype(np.float32)
+
+        train = lgb.Dataset(X, label=y)
+        valid = lgb.Dataset(X_valid, label=y_valid)
+
+        self.evaluation_results = {}
+        self.estimator = lgb.train(self.model_config,
+                                   train, valid_sets=[valid], valid_names=['valid'],
+                                   feature_name=feature_names,
+                                   categorical_feature=categorical_features,
+                                   evals_result=self.evaluation_results,
+                                   num_boost_round=self.training_config.number_boosting_rounds,
+                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
+                                   verbose_eval=self.model_config.verbose,
+                                   feval=self.evaluation_function)
+        return self
diff --git a/neptune.yaml b/neptune.yaml
new file mode 100644
index 0000000..aa6b307
--- /dev/null
+++ b/neptune.yaml
@@ -0,0 +1,57 @@
+project: HC
+
+name: home-credit-default-risk
+tags: [solution-1]
+
+metric:
+  channel: 'ROC_AUC'
+  goal: maximize
+
+exclude:
+  - output
+  - imgs
+  - neptune.log
+  - offline_job.log
+  - .git
+  - .idea
+  - .ipynb_checkpoints
+  - Untitled.ipynb
+
+parameters:
+# Data
+  train_filepath: YOUR/PATH/TO/application_train.csv
+  test_filepath: YOUR/PATH/TO/application_test.csv
+  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
+  experiment_dir: YOUR/PATH/WORKDIR
+
+# Validation
+  target_bins: 100
+  validation_size: 0.2
+
+# Execution
+  overwrite: 1
+  num_workers: 10
+  verbose: 1
+
+# Preprocessing
+  target_encoder__n_splits: 10
+
+# Light GBM
+  lgbm_random_search_runs: 0
+  lgbm__boosting_type: gbdt
+  lgbm__objective: binary
+  lgbm__metric: auc
+  lgbm__number_boosting_rounds: 500
+  lgbm__early_stopping_rounds: 30
+  lgbm__learning_rate: 0.05
+  lgbm__num_leaves: 31
+  lgbm__max_depth: 16
+  lgbm__min_child_samples: 42
+  lgbm__max_bin: 300
+  lgbm__subsample: 0.8
+  lgbm__subsample_freq: 5
+  lgbm__colsample_bytree: 0.8
+  lgbm__min_child_weight: 4
+  lgbm__reg_lambda: 0.0
+  lgbm__reg_alpha: 0.1
+  lgbm__scale_pos_weight: 1
diff --git a/neptune_random_search.yaml b/neptune_random_search.yaml
new file mode 100644
index 0000000..af1ac4d
--- /dev/null
+++ b/neptune_random_search.yaml
@@ -0,0 +1,57 @@
+project: HC
+
+name: home-credit-default-risk
+tags: [solution-1]
+
+metric:
+  channel: 'ROC_AUC'
+  goal: maximize
+
+exclude:
+  - output
+  - imgs
+  - neptune.log
+  - offline_job.log
+  - .git
+  - .idea
+  - .ipynb_checkpoints
+  - Untitled.ipynb
+
+parameters:
+# Data
+  train_filepath: YOUR/PATH/TO/application_train.csv
+  test_filepath: YOUR/PATH/TO/application_test.csv
+  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
+  experiment_dir: YOUR/PATH/WORKDIR
+
+# Validation
+  target_bins: 100
+  validation_size: 0.2
+
+# Execution
+  overwrite: 1
+  num_workers: 10
+  verbose: 1
+
+# Preprocessing
+  target_encoder__n_splits: 10
+
+# Light GBM
+  lgbm_random_search_runs: 10
+  lgbm__boosting_type: gbdt
+  lgbm__objective: binary
+  lgbm__metric: auc
+  lgbm__number_boosting_rounds: 1000
+  lgbm__early_stopping_rounds: 30
+  lgbm__learning_rate: '[0.01, 0.2, "uniform"]'
+  lgbm__num_leaves: '[25, 35]'
+  lgbm__max_depth: '[10, 20]'
+  lgbm__min_child_samples: '[35, 45]'
+  lgbm__max_bin: '[275, 325]'
+  lgbm__subsample: '[0.8, 1., 1.2, "list"]'
+  lgbm__subsample_freq: 5
+  lgbm__colsample_bytree: 0.8
+  lgbm__min_child_weight: 4
+  lgbm__reg_lambda: 0.0
+  lgbm__reg_alpha: 0.1
+  lgbm__scale_pos_weight: 1
diff --git a/pipeline_config.py b/pipeline_config.py
new file mode 100644
index 0000000..73ea34a
--- /dev/null
+++ b/pipeline_config.py
@@ -0,0 +1,180 @@
+import os
+
+from attrdict import AttrDict
+from deepsense import neptune
+
+from utils import read_params, safe_eval
+
+ctx = neptune.Context()
+params = read_params(ctx)
+
+CATEGORICAL_COLUMNS = ['CODE_GENDER',
+                       'EMERGENCYSTATE_MODE',
+                       'FLAG_CONT_MOBILE',
+                       'FLAG_DOCUMENT_3',
+                       'FLAG_DOCUMENT_4',
+                       'FLAG_DOCUMENT_5',
+                       'FLAG_DOCUMENT_6',
+                       'FLAG_DOCUMENT_7',
+                       'FLAG_DOCUMENT_8',
+                       'FLAG_DOCUMENT_9',
+                       'FLAG_DOCUMENT_11',
+                       'FLAG_DOCUMENT_18',
+                       'FLAG_EMAIL',
+                       'FLAG_EMP_PHONE',
+                       'FLAG_MOBIL',
+                       'FLAG_OWN_CAR',
+                       'FLAG_OWN_REALTY',
+                       'FLAG_PHONE',
+                       'FLAG_WORK_PHONE',
+                       'FONDKAPREMONT_MODE',
+                       'HOUR_APPR_PROCESS_START',
+                       'HOUSETYPE_MODE',
+                       'LIVE_CITY_NOT_WORK_CITY',
+                       'LIVE_REGION_NOT_WORK_REGION',
+                       'NAME_CONTRACT_TYPE',
+                       'NAME_TYPE_SUITE',
+                       'NAME_INCOME_TYPE',
+                       'NAME_EDUCATION_TYPE',
+                       'NAME_FAMILY_STATUS',
+                       'NAME_HOUSING_TYPE',
+                       'OCCUPATION_TYPE',
+                       'ORGANIZATION_TYPE',
+                       'REG_CITY_NOT_LIVE_CITY',
+                       'REG_CITY_NOT_WORK_CITY',
+                       'REG_REGION_NOT_LIVE_REGION',
+                       'REG_REGION_NOT_WORK_REGION',
+                       'WALLSMATERIAL_MODE',
+                       'WEEKDAY_APPR_PROCESS_START']
+NUMERICAL_COLUMNS = ['AMT_ANNUITY',
+                     'AMT_CREDIT',
+                     'AMT_GOODS_PRICE',
+                     'AMT_INCOME_TOTAL',
+                     'AMT_REQ_CREDIT_BUREAU_HOUR',
+                     'AMT_REQ_CREDIT_BUREAU_DAY',
+                     'AMT_REQ_CREDIT_BUREAU_WEEK',
+                     'AMT_REQ_CREDIT_BUREAU_MON',
+                     'AMT_REQ_CREDIT_BUREAU_QRT',
+                     'AMT_REQ_CREDIT_BUREAU_YEAR',
+                     'APARTMENTS_AVG',
+                     'APARTMENTS_MEDI',
+                     'APARTMENTS_MODE',
+                     'BASEMENTAREA_AVG',
+                     'BASEMENTAREA_MEDI',
+                     'BASEMENTAREA_MODE',
+                     'COMMONAREA_AVG',
+                     'COMMONAREA_MEDI',
+                     'COMMONAREA_MODE',
+                     'CNT_CHILDREN',
+                     'CNT_FAM_MEMBERS',
+                     'DAYS_BIRTH',
+                     'DAYS_EMPLOYED',
+                     'DAYS_ID_PUBLISH',
+                     'DAYS_LAST_PHONE_CHANGE',
+                     'DAYS_REGISTRATION',
+                     'DEF_30_CNT_SOCIAL_CIRCLE',
+                     'DEF_60_CNT_SOCIAL_CIRCLE',
+                     'ELEVATORS_AVG',
+                     'ELEVATORS_MEDI',
+                     'ELEVATORS_MODE',
+                     'ENTRANCES_AVG',
+                     'ENTRANCES_MEDI',
+                     'ENTRANCES_MODE',
+                     'EXT_SOURCE_1',
+                     'EXT_SOURCE_2',
+                     'EXT_SOURCE_3',
+                     'FLOORSMAX_AVG',
+                     'FLOORSMAX_MEDI',
+                     'FLOORSMAX_MODE',
+                     'FLOORSMIN_AVG',
+                     'FLOORSMIN_MEDI',
+                     'FLOORSMIN_MODE',
+                     'LANDAREA_AVG',
+                     'LANDAREA_MEDI',
+                     'LANDAREA_MODE',
+                     'LIVINGAPARTMENTS_AVG',
+                     'LIVINGAPARTMENTS_MEDI',
+                     'LIVINGAPARTMENTS_MODE',
+                     'LIVINGAREA_AVG',
+                     'LIVINGAREA_MEDI',
+                     'LIVINGAREA_MODE',
+                     'NONLIVINGAPARTMENTS_AVG',
+                     'NONLIVINGAPARTMENTS_MEDI',
+                     'NONLIVINGAPARTMENTS_MODE',
+                     'NONLIVINGAREA_AVG',
+                     'NONLIVINGAREA_MEDI',
+                     'NONLIVINGAREA_MODE',
+                     'OBS_30_CNT_SOCIAL_CIRCLE',
+                     'OBS_60_CNT_SOCIAL_CIRCLE',
+                     'OWN_CAR_AGE',
+                     'REGION_POPULATION_RELATIVE',
+                     'REGION_RATING_CLIENT',
+                     'REGION_RATING_CLIENT_W_CITY',
+                     'TOTALAREA_MODE',
+                     'YEARS_BEGINEXPLUATATION_AVG',
+                     'YEARS_BEGINEXPLUATATION_MEDI',
+                     'YEARS_BEGINEXPLUATATION_MODE',
+                     'YEARS_BUILD_AVG',
+                     'YEARS_BUILD_MEDI',
+                     'YEARS_BUILD_MODE']
+TIMESTAMP_COLUMNS = []
+USELESS_COLUMNS = ['FLAG_DOCUMENT_10',
+                   'FLAG_DOCUMENT_12',
+                   'FLAG_DOCUMENT_13',
+                   'FLAG_DOCUMENT_14',
+                   'FLAG_DOCUMENT_15',
+                   'FLAG_DOCUMENT_16',
+                   'FLAG_DOCUMENT_17',
+                   'FLAG_DOCUMENT_19',
+                   'FLAG_DOCUMENT_2',
+                   'FLAG_DOCUMENT_20',
+                   'FLAG_DOCUMENT_21']
+
+ID_COLUMNS = ['SK_ID_CURR']
+TARGET_COLUMNS = ['TARGET']
+
+DEV_SAMPLE_SIZE = int(10e4)
+
+SOLUTION_CONFIG = AttrDict({
+    'env': {'cache_dirpath': params.experiment_dir
+            },
+
+    'dataframe_by_type_splitter': {'numerical_columns': NUMERICAL_COLUMNS,
+                                   'categorical_columns': CATEGORICAL_COLUMNS,
+                                   'timestamp_columns': TIMESTAMP_COLUMNS,
+                                   },
+
+    'light_gbm': {'boosting_type': safe_eval(params.lgbm__boosting_type),
+                  'objective': safe_eval(params.lgbm__objective),
+                  'metric': safe_eval(params.lgbm__metric),
+                  'learning_rate': safe_eval(params.lgbm__learning_rate),
+                  'max_depth': safe_eval(params.lgbm__max_depth),
+                  'subsample': safe_eval(params.lgbm__subsample),
+                  'colsample_bytree': safe_eval(params.lgbm__colsample_bytree),
+                  'min_child_weight': safe_eval(params.lgbm__min_child_weight),
+                  'reg_lambda': safe_eval(params.lgbm__reg_lambda),
+                  'reg_alpha': safe_eval(params.lgbm__reg_alpha),
+                  'subsample_freq': safe_eval(params.lgbm__subsample_freq),
+                  'max_bin': safe_eval(params.lgbm__max_bin),
+                  'min_child_samples': safe_eval(params.lgbm__min_child_samples),
+                  'num_leaves': safe_eval(params.lgbm__num_leaves),
+                  'nthread': safe_eval(params.num_workers),
+                  'number_boosting_rounds': safe_eval(params.lgbm__number_boosting_rounds),
+                  'early_stopping_rounds': safe_eval(params.lgbm__early_stopping_rounds),
+                  'verbose': safe_eval(params.verbose)
+                  },
+
+    'random_search': {'light_gbm': {'n_runs': params.lgbm_random_search_runs,
+                                    'callbacks': {'neptune_monitor': {'name': 'light_gbm'
+                                                                      },
+                                                  'save_results': {'filepath': os.path.join(params.experiment_dir,
+                                                                                            'random_search_light_gbm.pkl')
+                                                                   }
+                                                  }
+                                    }
+                      },
+
+    'clipper': {'min_val': 0,
+                'max_val': 1
+                }
+})
diff --git a/pipelines.py b/pipelines.py
new file mode 100644
index 0000000..840b2c0
--- /dev/null
+++ b/pipelines.py
@@ -0,0 +1,248 @@
+from functools import partial
+
+from sklearn.metrics import roc_auc_score
+from steppy.adapter import Adapter, E
+from steppy.base import Step
+
+import feature_extraction as fe
+from hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, SaveResults
+from models import LightGBMLowMemory as LightGBM
+from postprocessing import Clipper
+
+
+def lightGBM(config, train_mode):
+    if train_mode:
+        features, features_valid = feature_extraction(config,
+                                                      train_mode,
+                                                      save_output=True,
+                                                      cache_output=True,
+                                                      load_saved_output=True)
+        light_gbm = classifier_lgbm((features, features_valid),
+                                    config,
+                                    train_mode)
+    else:
+        features = feature_extraction(config,
+                                      train_mode,
+                                      cache_output=True)
+        light_gbm = classifier_lgbm(features,
+                                    config,
+                                    train_mode)
+
+    clipper = Step(name='clipper',
+                   transformer=Clipper(**config.clipper),
+                   input_steps=[light_gbm],
+                   adapter=Adapter({'prediction': E(light_gbm.name, 'prediction')}),
+                   cache_dirpath=config.env.cache_dirpath)
+
+    return clipper
+
+
+def feature_extraction(config, train_mode, **kwargs):
+    if train_mode:
+        feature_by_type_split, feature_by_type_split_valid = _feature_by_type_splits(config, train_mode)
+
+        target_encoder, target_encoder_valid = _target_encoders((feature_by_type_split, feature_by_type_split_valid),
+                                                                config, train_mode,
+                                                                **kwargs)
+
+        feature_combiner, feature_combiner_valid = _join_features(numerical_features=[feature_by_type_split],
+                                                                  numerical_features_valid=[feature_by_type_split_valid],
+                                                                  categorical_features=[target_encoder],
+                                                                  categorical_features_valid=[target_encoder_valid],
+                                                                  config=config,
+                                                                  train_mode=train_mode,
+                                                                  **kwargs)
+
+        return feature_combiner, feature_combiner_valid
+    else:
+        feature_by_type_split = _feature_by_type_splits(config, train_mode)
+
+        target_encoder = _target_encoders(feature_by_type_split, config, train_mode, **kwargs)
+
+        feature_combiner = _join_features(numerical_features=[feature_by_type_split],
+                                          numerical_features_valid=[],
+                                          categorical_features=[target_encoder],
+                                          categorical_features_valid=[],
+                                          config=config,
+                                          train_mode=train_mode,
+                                          **kwargs)
+
+        return feature_combiner
+
+
+def _feature_by_type_splits(config, train_mode):
+    if train_mode:
+        feature_by_type_split = Step(name='feature_by_type_split',
+                                     transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter),
+                                     input_data=['input'],
+                                     adapter=Adapter({'X': E('input', 'X')}),
+                                     cache_dirpath=config.env.cache_dirpath)
+
+        feature_by_type_split_valid = Step(name='feature_by_type_split_valid',
+                                           transformer=feature_by_type_split,
+                                           input_data=['input'],
+                                           adapter=Adapter({'X': E('input', 'X_valid')}),
+                                           cache_dirpath=config.env.cache_dirpath)
+
+        return feature_by_type_split, feature_by_type_split_valid
+
+    else:
+        feature_by_type_split = Step(name='feature_by_type_split',
+                                     transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter),
+                                     input_data=['input'],
+                                     adapter=Adapter({'X': E('input', 'X')}),
+                                     cache_dirpath=config.env.cache_dirpath)
+
+        return feature_by_type_split
+
+
+def _join_features(numerical_features,
+                   numerical_features_valid,
+                   categorical_features,
+                   categorical_features_valid,
+                   config, train_mode,
+                   **kwargs):
+    if train_mode:
+        feature_joiner = Step(name='feature_joiner',
+                              transformer=fe.FeatureJoiner(),
+                              input_steps=numerical_features + categorical_features,
+                              adapter=Adapter({
+                                  'numerical_feature_list': [
+                                      E(feature.name, 'numerical_features') for feature in numerical_features],
+                                  'categorical_feature_list': [
+                                      E(feature.name, 'categorical_features') for feature in categorical_features],
+                              }),
+                              cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        feature_joiner_valid = Step(name='feature_joiner_valid',
+                                    transformer=feature_joiner,
+                                    input_steps=numerical_features_valid + categorical_features_valid,
+                                    adapter=Adapter({
+                                        'numerical_feature_list': [
+                                            E(feature.name,
+                                              'numerical_features') for feature in numerical_features_valid],
+                                        'categorical_feature_list': [
+                                            E(feature.name,
+                                              'categorical_features') for feature in categorical_features_valid],
+                                    }),
+                                    cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        return feature_joiner, feature_joiner_valid
+
+    else:
+        feature_joiner = Step(name='feature_joiner',
+                              transformer=fe.FeatureJoiner(),
+                              input_steps=numerical_features + categorical_features,
+                              adapter=Adapter({
+                                  'numerical_feature_list': [
+                                      E(feature.name, 'numerical_features') for feature in numerical_features],
+                                  'categorical_feature_list': [
+                                      E(feature.name, 'categorical_features') for feature in categorical_features],
+                              }),
+                              cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        return feature_joiner
+
+
+def classifier_lgbm(features, config, train_mode, **kwargs):
+    if train_mode:
+        features_train, features_valid = features
+        if config.random_search.light_gbm.n_runs:
+            transformer = RandomSearchOptimizer(LightGBM, config.light_gbm,
+                                                train_input_keys=[],
+                                                valid_input_keys=['X_valid', 'y_valid'],
+                                                score_func=roc_auc_score,
+                                                maximize=True,
+                                                n_runs=config.random_search.light_gbm.n_runs,
+                                                callbacks=[NeptuneMonitor(
+                                                    **config.random_search.light_gbm.callbacks.neptune_monitor),
+                                                    SaveResults(
+                                                        **config.random_search.light_gbm.callbacks.save_results)
+                                                ])
+        else:
+            transformer = LightGBM(**config.light_gbm)
+
+        light_gbm = Step(name='light_gbm',
+                         transformer=transformer,
+                         input_data=['input'],
+                         input_steps=[features_train, features_valid],
+                         adapter=Adapter({'X': E(features_train.name, 'features'),
+                                          'y': E('input', 'y'),
+                                          'feature_names': E(features_train.name, 'feature_names'),
+                                          'categorical_features': E(features_train.name, 'categorical_features'),
+                                          'X_valid': E(features_valid.name, 'features'),
+                                          'y_valid': E('input', 'y_valid'),
+                                          }),
+                         cache_dirpath=config.env.cache_dirpath,
+                         **kwargs)
+    else:
+        light_gbm = Step(name='light_gbm',
+                         transformer=LightGBM(**config.light_gbm),
+                         input_steps=[features],
+                         adapter=Adapter({'X': E(features.name, 'features')}),
+                         cache_dirpath=config.env.cache_dirpath,
+                         **kwargs)
+    return light_gbm
+
+
+def _target_encoders(dispatchers, config, train_mode, **kwargs):
+    if train_mode:
+        feature_by_type_split, feature_by_type_split_valid = dispatchers
+        numpy_label, numpy_label_valid = _to_numpy_label(config, **kwargs)
+        target_encoder = Step(name='target_encoder',
+                              transformer=fe.TargetEncoder(),
+                              input_data=['input'],
+                              input_steps=[feature_by_type_split, numpy_label],
+                              adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features'),
+                                               'y': E(numpy_label.name, 'y'),
+                                               }),
+                              cache_dirpath=config.env.cache_dirpath,
+                              **kwargs)
+
+        target_encoder_valid = Step(name='target_encoder_valid',
+                                    transformer=target_encoder,
+                                    input_data=['input'],
+                                    input_steps=[feature_by_type_split_valid, numpy_label_valid],
+                                    adapter=Adapter({'X': E(feature_by_type_split_valid.name, 'categorical_features'),
+                                                     'y': E(numpy_label_valid.name, 'y'),
+                                                     }),
+                                    cache_dirpath=config.env.cache_dirpath,
+                                    **kwargs)
+
+        return target_encoder, target_encoder_valid
+
+    else:
+        feature_by_type_split = dispatchers
+
+        target_encoder = Step(name='target_encoder',
+                              transformer=fe.TargetEncoder(),
+                              input_data=['input'],
+                              input_steps=[feature_by_type_split],
+                              adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features')}),
+                              cache_dirpath=config.env.cache_dirpath,
+                              **kwargs)
+
+        return target_encoder
+
+
+def _to_numpy_label(config, **kwargs):
+    to_numpy_label = Step(name='to_numpy_label',
+                          transformer=fe.ToNumpyLabel(),
+                          input_data=['input'],
+                          adapter=Adapter({'y': [E('input', 'y')]}),
+                          cache_dirpath=config.env.cache_dirpath,
+                          **kwargs)
+
+    to_numpy_label_valid = Step(name='to_numpy_label_valid',
+                                transformer=to_numpy_label,
+                                input_data=['input'],
+                                adapter=Adapter({'y': [E('input', 'y_valid')]}),
+                                cache_dirpath=config.env.cache_dirpath,
+                                **kwargs)
+
+    return to_numpy_label, to_numpy_label_valid
+
+
+PIPELINES = {'lightGBM': {'train': partial(lightGBM, train_mode=True),
+                          'inference': partial(lightGBM, train_mode=False)},
+             }
diff --git a/postprocessing.py b/postprocessing.py
new file mode 100644
index 0000000..0aa3d46
--- /dev/null
+++ b/postprocessing.py
@@ -0,0 +1,13 @@
+import numpy as np
+
+from steppy.base import BaseTransformer
+
+
+class Clipper(BaseTransformer):
+    def __init__(self, min_val=0, max_val=1):
+        self.min_val = min_val
+        self.max_val = max_val
+
+    def transform(self, prediction):
+        prediction_ = np.clip(prediction, self.min_val, self.max_val)
+        return {'clipped_prediction': prediction_}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ab9b40f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+neptune-cli
+steppy-toolkit
+steppy==0.1.1
+attrdict==2.0.0
+category_encoders==1.2.6
+click==6.7
+lightgbm==2.1.1
+numpy==1.14.3
+pandas==0.23.0
+scikit_learn==0.19.1
+PyYAML==3.12
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..807fdc2
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,103 @@
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+import pandas as pd
+import yaml
+from attrdict import AttrDict
+from sklearn.model_selection import train_test_split
+
+
+def create_submission(meta, predictions):
+    submission = pd.DataFrame({'SK_ID_CURR': meta['SK_ID_CURR'].tolist(),
+                               'TARGET': predictions
+                               })
+    return submission
+
+
+def verify_submission(submission, sample_submission):
+
+    assert submission.shape == sample_submission.shape, \
+        'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape)
+
+    for submission_id, correct_id in zip(submission['SK_ID_CURR'].values, sample_submission['SK_ID_CURR'].values):
+        assert correct_id == submission_id, \
+            'Wrong id: expected {} but got {}'.format(correct_id, submission_id)
+
+
+def get_logger():
+    return logging.getLogger('home-credit')
+
+
+def init_logger():
+    logger = logging.getLogger('home-credit')
+    logger.setLevel(logging.INFO)
+    message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
+                                       datefmt='%Y-%m-%d %H-%M-%S')
+
+    # console handler for validation info
+    ch_va = logging.StreamHandler(sys.stdout)
+    ch_va.setLevel(logging.INFO)
+
+    ch_va.setFormatter(fmt=message_format)
+
+    # add the handlers to the logger
+    logger.addHandler(ch_va)
+
+    return logger
+
+
+def log_loss_row(y_true, y_pred, eps=1e-15):
+    y_pred = np.clip(y_pred, eps, 1 - eps)
+    scores = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
+    return scores
+
+
+def read_params(ctx):
+    if ctx.params.__class__.__name__ == 'OfflineContextParams':
+        try:
+            neptune_config = read_yaml('neptune.yaml')
+        except FileNotFoundError:
+            neptune_config = read_yaml('../neptune.yaml')
+        params = neptune_config.parameters
+    else:
+        params = ctx.params
+    return params
+
+
+def read_yaml(filepath):
+    with open(filepath) as f:
+        config = yaml.load(f)
+    return AttrDict(config)
+
+
+def safe_eval(obj):
+    try:
+        return eval(obj)
+    except Exception:
+        return obj
+
+
+def save_evaluation_predictions(experiment_dir, y_true, y_pred, raw_data):
+    raw_data['y_pred'] = y_pred
+    raw_data['score'] = log_loss_row(y_true, y_pred)
+
+    raw_data.sort_values('score', ascending=False, inplace=True)
+
+    filepath = os.path.join(experiment_dir, 'evaluation_predictions.csv')
+    raw_data.to_csv(filepath, index=None)
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+
+
+def stratified_train_valid_split(meta_train, target_column, target_bins, valid_size, random_state=1234):
+    y = meta_train[target_column].values
+    bins = np.linspace(0, y.shape[0], target_bins)
+    y_binned = np.digitize(y, bins)
+
+    return train_test_split(meta_train, test_size=valid_size, stratify=y_binned, random_state=random_state)
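Usage sketch, not part of the patch: main.py registers the click commands train, evaluate, predict, train_evaluate_predict, evaluate_predict and train_evaluate, each taking -p/--pipeline_name (the only pipeline defined in pipelines.py is 'lightGBM') and an optional -d/--dev_mode flag, while read_params() falls back to neptune.yaml when the neptune context reports offline parameters. Assuming the YOUR/PATH/... placeholders in neptune.yaml have been filled in and the pinned requirements are installed, a local run might look like:

    python main.py train_evaluate_predict --pipeline_name lightGBM --dev_mode

Dropping --dev_mode trains on the full application_train.csv instead of the DEV_SAMPLE_SIZE sample, and the submission is then written to experiment_dir/submission.csv.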
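Illustrative sketch, not part of the patch: the quoted lists in neptune_random_search.yaml (e.g. '[0.01, 0.2, "uniform"]') are turned into Python lists by safe_eval() in pipeline_config.py and then sampled by create_param_space()/sample_param_space() in hyperparameter_tuning.py — a two-element list becomes an integer 'choice' range, a trailing "uniform"/"log-uniform" string selects continuous sampling, a trailing "list" picks one of the listed values, and scalars pass through unchanged. A minimal standalone run (parameter names and values below are made up; assumes the pinned requirements, including neptune-cli for the deepsense import, are installed):

    import numpy as np

    from hyperparameter_tuning import create_param_space

    # Hypothetical search space written in the same format as the lgbm__* entries.
    params = {
        'learning_rate': [0.01, 0.2, 'uniform'],  # continuous uniform draw between 0.01 and 0.2
        'num_leaves': [25, 35],                   # two elements -> integer choice from range(25, 35)
        'subsample': [0.8, 1.0, 1.2, 'list'],     # pick one of the listed values
        'metric': 'auc',                          # scalar -> kept as-is in every run
    }

    np.random.seed(0)  # create_param_space draws its own per-run seeds from numpy's RNG
    param_space = create_param_space(params, n_runs=3)
    for param_set in param_space:
        print(param_set)  # one sampled parameter dict per random-search run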
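Illustrative sketch, not part of the patch: the feature_extraction transformers communicate through dictionaries, which is what the Adapter mappings in pipelines.py rely on — DataFrameByTypeSplitter returns 'numerical_features'/'categorical_features' frames, and FeatureJoiner returns the concatenated float32 'features' matrix plus the 'feature_names' and 'categorical_features' name lists. A toy call outside the Step machinery (column names and values are invented; the real pipeline target-encodes categoricals before joining, so numeric codes stand in here):

    import pandas as pd

    from feature_extraction import DataFrameByTypeSplitter, FeatureJoiner

    X = pd.DataFrame({'AMT_CREDIT': [100.0, 250.0],   # numerical column
                      'CODE_GENDER': [0, 1]})         # categorical column, already numeric in this toy

    splitter = DataFrameByTypeSplitter(numerical_columns=['AMT_CREDIT'],
                                       categorical_columns=['CODE_GENDER'],
                                       timestamp_columns=[])
    split = splitter.transform(X)  # dict with 'numerical_features', 'categorical_features', 'timestamp_features'

    joined = FeatureJoiner().transform(numerical_feature_list=[split['numerical_features']],
                                       categorical_feature_list=[split['categorical_features']])
    print(joined['feature_names'])         # ['AMT_CREDIT', 'CODE_GENDER']
    print(joined['categorical_features'])  # ['CODE_GENDER']
    print(joined['features'].dtypes)       # float32 columns, as consumed by LightGBMLowMemory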