From 0dd4b1413ad6e44bff1582b0c8d6da0d29e13e77 Mon Sep 17 00:00:00 2001
From: "Kamil A. Kaczmarek"
Date: Tue, 5 Jun 2018 13:53:08 +0200
Subject: [PATCH] Dev solution 1 (#40)

* dev-Solution1 (#16)

* solution 1

* Add features

* random-search (#17)

* random-search

* Update neptune.yaml

* Solution 1 - upgrade (#18)

* Remove save_eval function

* Add more features

* Refactor code - steppy 0.1.1 (#22)

* Refactor code

* Include suggested changes

* Update models.py

* Verifying submission (#26)

* Refactor code

* Do not save submission if dev_mode

* Verifying submission

* Add USELESS_COLUMNS

* Include remaining features

* Fix random search

* added requirements

* optimized imports

* Fix solution-1 (#38)

* Refactor code

* Fix

* small refactor

* added neptune-cli to requirements

* added steppy-toolkit to requirements

* one line clip in postprocessing (#39)

* cleaning code for release of the solution-1

* cleaning code for the solution-1

* cleaning code for the solution-1 (again)
---
 .gitignore                 |   1 +
 feature_extraction.py      |  90 ++++++++++++++
 hyperparameter_tuning.py   | 155 +++++++++++++++++++++++
 main.py                    | 184 +++++++++++++++++++++++++++
 models.py                  |  31 +++++
 neptune.yaml               |  57 +++++++++
 neptune_random_search.yaml |  57 +++++++++
 pipeline_config.py         | 180 +++++++++++++++++++++++++++
 pipelines.py               | 248 +++++++++++++++++++++++++++++++++++++
 postprocessing.py          |  13 ++
 requirements.txt           |  11 ++
 utils.py                   | 103 +++++++++++++++
 12 files changed, 1130 insertions(+)
 create mode 100644 feature_extraction.py
 create mode 100644 hyperparameter_tuning.py
 create mode 100644 main.py
 create mode 100644 models.py
 create mode 100644 neptune.yaml
 create mode 100644 neptune_random_search.yaml
 create mode 100644 pipeline_config.py
 create mode 100644 pipelines.py
 create mode 100644 postprocessing.py
 create mode 100644 requirements.txt
 create mode 100644 utils.py

diff --git a/.gitignore b/.gitignore
index dbb7aae..96eb0db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ tests/.cache
 .cache/
 .idea/
 .idea_modules/
+*_local.yaml
 out/
 output
 output/
diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000..6d90588
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,90 @@
+import category_encoders as ce
+import numpy as np
+import pandas as pd
+from sklearn.externals import joblib
+from steppy.adapters import to_numpy_label_inputs
+from steppy.base import BaseTransformer
+from steppy.utils import get_logger
+
+logger = get_logger()
+
+
+class DataFrameByTypeSplitter(BaseTransformer):
+    def __init__(self, numerical_columns, categorical_columns, timestamp_columns):
+        self.numerical_columns = numerical_columns
+        self.categorical_columns = categorical_columns
+        self.timestamp_columns = timestamp_columns
+
+    def transform(self, X, y=None, **kwargs):
+        outputs = {}
+
+        if self.numerical_columns is not None:
+            outputs['numerical_features'] = X[self.numerical_columns]
+
+        if self.categorical_columns is not None:
+            outputs['categorical_features'] = X[self.categorical_columns]
+
+        if self.timestamp_columns is not None:
+            outputs['timestamp_features'] = X[self.timestamp_columns]
+
+        return outputs
+
+
+class FeatureJoiner(BaseTransformer):
+    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
+        features = numerical_feature_list + categorical_feature_list
+        for feature in features:
+            feature.reset_index(drop=True, inplace=True)
+        outputs = {}
+        outputs['features'] = pd.concat(features, axis=1).astype(np.float32)
+        outputs['feature_names'] = self._get_feature_names(features)
+        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
+        return outputs
+
+    def _get_feature_names(self, dataframes):
+        feature_names = []
+        for dataframe in dataframes:
+            try:
+                feature_names.extend(list(dataframe.columns))
+            except Exception as e:
+                print(e)
+                feature_names.append(dataframe.name)
+
+        return feature_names
+
+
+class TargetEncoder(BaseTransformer):
+    def __init__(self, **kwargs):
+        self.params = kwargs
+        self.encoder_class = ce.TargetEncoder
+
+    def fit(self, X, y, **kwargs):
+        categorical_columns = list(X.columns)
+        self.target_encoder = self.encoder_class(cols=categorical_columns, **self.params)
+        self.target_encoder.fit(X, y)
+        return self
+
+    def transform(self, X, y=None, **kwargs):
+        X_ = self.target_encoder.transform(X)
+        return {'categorical_features': X_}
+
+    def load(self, filepath):
+        self.target_encoder = joblib.load(filepath)
+        return self
+
+    def save(self, filepath):
+        joblib.dump(self.target_encoder, filepath)
+
+
+class ToNumpyLabel(BaseTransformer):
+    def __init__(self, **kwargs):
+        self.y = None
+
+    def fit(self, y, **kwargs):
+        self.y = to_numpy_label_inputs(y)
+        return self
+
+    def transform(self, **kwargs):
+        if self.y.any():
+            return {'y': self.y}
+        return {}
diff --git a/hyperparameter_tuning.py b/hyperparameter_tuning.py
new file mode 100644
index 0000000..4ac478c
--- /dev/null
+++ b/hyperparameter_tuning.py
@@ -0,0 +1,155 @@
+import gc
+
+import numpy as np
+from deepsense import neptune
+from sklearn.externals import joblib
+from steppy.base import BaseTransformer
+
+from utils import set_seed
+
+
+class RandomSearchOptimizer(BaseTransformer):
+    def __init__(self, TransformerClass, params,
+                 score_func, maximize,
+                 train_input_keys, valid_input_keys,
+                 n_runs,
+                 callbacks=[]):
+        self.TransformerClass = TransformerClass
+        self.param_space = create_param_space(params, n_runs)
+        self.train_input_keys = train_input_keys
+        self.valid_input_keys = valid_input_keys
+        self.score_func = score_func
+        self.maximize = maximize
+        self.callbacks = callbacks
+        self.best_transformer = TransformerClass(**self.param_space[0])
+
+    def fit(self, **kwargs):
+        if self.train_input_keys:
+            train_inputs = {input_key: kwargs[input_key] for input_key in self.train_input_keys}
+        else:
+            train_inputs = kwargs
+        X_valid, y_valid = kwargs[self.valid_input_keys[0]], kwargs[self.valid_input_keys[1]]
+
+        results = []
+        for i, param_set in enumerate(self.param_space):
+            try:
+                transformer = self.TransformerClass(**param_set)
+                transformer.fit(**train_inputs)
+            except Exception:
+                continue
+            y_pred_valid = transformer.transform(X_valid)
+            y_pred_valid_value = list(y_pred_valid.values())[0]
+            run_score = self.score_func(y_valid, y_pred_valid_value)
+            results.append((run_score, param_set))
+
+            del y_pred_valid, transformer
+            gc.collect()
+
+            for callback in self.callbacks:
+                callback.on_run_end(score=run_score, params=param_set)
+
+        assert len(results) > 0, 'All random search runs failed, check your parameter space'
+        results_sorted = sorted(results, key=lambda x: x[0])
+
+        if self.maximize:
+            best_score, best_param_set = results_sorted[-1]
+        else:
+            best_score, best_param_set = results_sorted[0]
+
+        for callback in self.callbacks:
+            callback.on_search_end(results=results)
+
+        self.best_transformer = self.TransformerClass(**best_param_set)
+        self.best_transformer.fit(**train_inputs)
+        return self
+
+    def transform(self, **kwargs):
+        return self.best_transformer.transform(**kwargs)
+
+    def save(self, filepath):
+        self.best_transformer.save(filepath)
+
+    def load(self, filepath):
+        self.best_transformer.load(filepath)
+        return self
+
+
+def create_param_space(params, n_runs):
+    seed = np.random.randint(1000)
+    param_space = []
+    for i in range(n_runs):
+        set_seed(seed + i)
+        param_choice = {}
+        for param, value in params.items():
+            if isinstance(value, list):
+                if len(value) == 2:
+                    mode = 'choice'
+                    param_choice[param] = sample_param_space(value, mode)
+                else:
+                    mode = value[-1]
+                    param_choice[param] = sample_param_space(value[:-1], mode)
+            else:
+                param_choice[param] = value
+        param_space.append(param_choice)
+    return param_space
+
+
+def sample_param_space(value_range, mode):
+    if mode == 'list':
+        value = np.random.choice(value_range)
+    else:
+        range_min, range_max = value_range
+        if mode == 'choice':
+            value = np.random.choice(range(range_min, range_max, 1))
+        elif mode == 'uniform':
+            value = np.random.uniform(low=range_min, high=range_max)
+        elif mode == 'log-uniform':
+            value = np.exp(np.random.uniform(low=np.log(range_min), high=np.log(range_max)))
+        else:
+            raise NotImplementedError
+    return value
+
+
+class GridSearchCallback:
+    def on_run_end(self, score, params):
+        return NotImplementedError
+
+    def on_search_end(self, results):
+        return NotImplementedError
+
+
+class NeptuneMonitor(GridSearchCallback):
+    def __init__(self, name):
+        self.name = name
+        self.ctx = neptune.Context()
+        self.highest_params_channel = self._create_text_channel(name='highest params')
+        self.lowest_params_channel = self._create_text_channel(name='lowest params')
+        self.run_params_channel = self._create_text_channel(name='run params')
+        self.run_id = 0
+
+    def on_run_end(self, score, params):
+        self.ctx.channel_send('score on run', x=self.run_id, y=score)
+        self.run_params_channel.send(y=params)
+        self.run_id += 1
+
+    def on_search_end(self, results):
+        results_sorted = sorted(results, key=lambda x: x[0])
+        highest_score, highest_param_set = results_sorted[-1]
+        lowest_score, lowest_param_set = results_sorted[0]
+
+        self.ctx.channel_send('highest score', x=0, y=highest_score)
+        self.ctx.channel_send('lowest score', x=0, y=lowest_score)
+
+        self.highest_params_channel.send(y=highest_param_set)
+        self.lowest_params_channel.send(y=lowest_param_set)
+
+    def _create_text_channel(self, name=''):
+        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)
+
+
+class SaveResults(GridSearchCallback):
+    def __init__(self, filepath):
+        self.filepath = filepath
+
+    def on_search_end(self, results):
+        joblib.dump(results, self.filepath)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..536e417
--- /dev/null
+++ b/main.py
@@ -0,0 +1,184 @@
+import os
+import shutil
+
+import click
+import pandas as pd
+from deepsense import neptune
+from sklearn.metrics import roc_auc_score
+
+import pipeline_config as cfg
+from pipelines import PIPELINES
+from utils import create_submission, init_logger, read_params, save_evaluation_predictions, \
+    set_seed, stratified_train_valid_split, verify_submission
+
+set_seed(1234)
+logger = init_logger()
+ctx = neptune.Context()
+params = read_params(ctx)
+
+
+@click.group()
+def action():
+    pass
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train(pipeline_name, dev_mode):
+    _train(pipeline_name, dev_mode)
+
+
+def _train(pipeline_name, dev_mode):
+    if bool(params.overwrite) and os.path.isdir(params.experiment_dir):
+        shutil.rmtree(params.experiment_dir)
+
+    logger.info('reading data in')
+    if dev_mode:
+        meta_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_train = pd.read_csv(params.train_filepath)
+
+    meta_train_split, meta_valid_split = stratified_train_valid_split(meta_train,
+                                                                      target_column=cfg.TARGET_COLUMNS,
+                                                                      target_bins=params.target_bins,
+                                                                      valid_size=params.validation_size,
+                                                                      random_state=1234)
+
+    logger.info('Target distribution in train: {}'.format(meta_train_split[cfg.TARGET_COLUMNS].mean()))
+    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))
+
+    logger.info('shuffling data')
+    meta_train_split = meta_train_split.sample(frac=1)
+    meta_valid_split = meta_valid_split.sample(frac=1)
+
+    data = {'input': {'X': meta_train_split.drop(cfg.TARGET_COLUMNS, axis=1),
+                      'y': meta_train_split[cfg.TARGET_COLUMNS],
+                      'X_valid': meta_valid_split.drop(cfg.TARGET_COLUMNS, axis=1),
+                      'y_valid': meta_valid_split[cfg.TARGET_COLUMNS],
+                      },
+            }
+
+    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    pipeline.fit_transform(data)
+    pipeline.clean_cache()
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def evaluate(pipeline_name, dev_mode):
+    _evaluate(pipeline_name, dev_mode)
+
+
+def _evaluate(pipeline_name, dev_mode):
+    logger.info('reading data in')
+    if dev_mode:
+        meta_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_train = pd.read_csv(params.train_filepath)
+
+    _, meta_valid_split = stratified_train_valid_split(meta_train,
+                                                       target_column=cfg.TARGET_COLUMNS,
+                                                       target_bins=params.target_bins,
+                                                       valid_size=params.validation_size,
+                                                       random_state=1234)
+
+    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))
+
+    data = {'input': {'X': meta_valid_split,
+                      'y': None,
+                      },
+            }
+    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    output = pipeline.transform(data)
+    pipeline.clean_cache()
+    y_pred = output['clipped_prediction']
+    y_true = meta_valid_split[cfg.TARGET_COLUMNS].values.reshape(-1)
+
+    logger.info('Saving evaluation predictions')
+    save_evaluation_predictions(params.experiment_dir, y_true, y_pred, meta_valid_split)
+
+    logger.info('Calculating ROC_AUC Full Scores')
+    score = roc_auc_score(y_true, y_pred)
+    logger.info('ROC_AUC score on validation is {}'.format(score))
+    ctx.channel_send('ROC_AUC', 0, score)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def predict(pipeline_name, dev_mode):
+    _predict(pipeline_name, dev_mode)
+
+
+def _predict(pipeline_name, dev_mode):
+    logger.info('reading data in')
+    if dev_mode:
+        meta_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
+    else:
+        meta_test = pd.read_csv(params.test_filepath)
+
+    data = {'input': {'X': meta_test,
+                      'y': None,
+                      },
+            }
+
+    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
+    pipeline.clean_cache()
+    output = pipeline.transform(data)
+    pipeline.clean_cache()
+    y_pred = output['clipped_prediction']
+
+    logger.info('creating submission...')
+    submission = create_submission(meta_test, y_pred)
+
+    logger.info('verifying submission')
+    sample_submission = pd.read_csv(params.sample_submission_filepath)
+    verify_submission(submission, sample_submission)
+
+    if dev_mode:
+        logger.info('submission can\'t be saved in dev mode')
+    else:
+        submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
+        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
+        logger.info('submission saved to {}'.format(submission_filepath))
+        logger.info('submission head \n\n{}'.format(submission.head()))
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate_predict(pipeline_name, dev_mode):
+    logger.info('TRAINING')
+    _train(pipeline_name, dev_mode)
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+    logger.info('PREDICTION')
+    _predict(pipeline_name, dev_mode)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def evaluate_predict(pipeline_name, dev_mode):
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+    logger.info('PREDICTION')
+    _predict(pipeline_name, dev_mode)
+
+
+@action.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate(pipeline_name, dev_mode):
+    logger.info('TRAINING')
+    _train(pipeline_name, dev_mode)
+    logger.info('EVALUATION')
+    _evaluate(pipeline_name, dev_mode)
+
+
+if __name__ == "__main__":
+    action()
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..705ba45
--- /dev/null
+++ b/models.py
@@ -0,0 +1,31 @@
+import lightgbm as lgb
+import numpy as np
+from steppy.adapters import to_numpy_label_inputs
+from toolkit.misc import LightGBM
+
+
+class LightGBMLowMemory(LightGBM):
+    def fit(self, X, y, X_valid, y_valid, feature_names=None, categorical_features=None, **kwargs):
+        y = to_numpy_label_inputs([y])
+        y_valid = to_numpy_label_inputs([y_valid])
+
+        X = X[feature_names].values.astype(np.float32)
+        y = y.astype(np.float32)
+
+        X_valid = X_valid[feature_names].values.astype(np.float32)
+        y_valid = y_valid.astype(np.float32)
+
+        train = lgb.Dataset(X, label=y)
+        valid = lgb.Dataset(X_valid, label=y_valid)
+
+        self.evaluation_results = {}
+        self.estimator = lgb.train(self.model_config,
+                                   train, valid_sets=[valid], valid_names=['valid'],
+                                   feature_name=feature_names,
+                                   categorical_feature=categorical_features,
+                                   evals_result=self.evaluation_results,
+                                   num_boost_round=self.training_config.number_boosting_rounds,
+                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
+                                   verbose_eval=self.model_config.verbose,
+                                   feval=self.evaluation_function)
+        return self
diff --git a/neptune.yaml b/neptune.yaml
new file mode 100644
index 0000000..aa6b307
--- /dev/null
+++ b/neptune.yaml
@@ -0,0 +1,57 @@
+project: HC
+
+name: home-credit-default-risk
+tags: [solution-1]
+
+metric:
+  channel: 'ROC_AUC'
+  goal: maximize
+
+exclude:
+  - output
+  - imgs
+  - neptune.log
+  - offline_job.log
+  - .git
+  - .idea
+  - .ipynb_checkpoints
+  - Untitled.ipynb
+
+parameters:
+# Data
+  train_filepath: YOUR/PATH/TO/application_train.csv
+  test_filepath: YOUR/PATH/TO/application_test.csv
+  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
+  experiment_dir: YOUR/PATH/WORKDIR
+
+# Validation
+  target_bins: 100
+  validation_size: 0.2
+
+# Execution
+  overwrite: 1
+  num_workers: 10
+  verbose: 1
+
+# Preprocessing
+  target_encoder__n_splits: 10
+
+# Light GBM
+  lgbm_random_search_runs: 0
+  lgbm__boosting_type: gbdt
+  lgbm__objective: binary
+  lgbm__metric: auc
+  lgbm__number_boosting_rounds: 500
+  lgbm__early_stopping_rounds: 30
+  lgbm__learning_rate: 0.05
+  lgbm__num_leaves: 31
+  lgbm__max_depth: 16
+  lgbm__min_child_samples: 42
+  lgbm__max_bin: 300
+  lgbm__subsample: 0.8
+  lgbm__subsample_freq: 5
+  lgbm__colsample_bytree: 0.8
+  lgbm__min_child_weight: 4
+  lgbm__reg_lambda: 0.0
+  lgbm__reg_alpha: 0.1
+  lgbm__scale_pos_weight: 1
diff --git a/neptune_random_search.yaml b/neptune_random_search.yaml
new file mode 100644
index 0000000..af1ac4d
--- /dev/null
+++ b/neptune_random_search.yaml
@@ -0,0 +1,57 @@
+project: HC
+
+name: home-credit-default-risk
+tags: [solution-1]
+
+metric:
+  channel: 'ROC_AUC'
+  goal: maximize
+
+exclude:
+  - output
+  - imgs
+  - neptune.log
+  - offline_job.log
+  - .git
+  - .idea
+  - .ipynb_checkpoints
+  - Untitled.ipynb
+
+parameters:
+# Data
+  train_filepath: YOUR/PATH/TO/application_train.csv
+  test_filepath: YOUR/PATH/TO/application_test.csv
+  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
+  experiment_dir: YOUR/PATH/WORKDIR
+
+# Validation
+  target_bins: 100
+  validation_size: 0.2
+
+# Execution
+  overwrite: 1
+  num_workers: 10
+  verbose: 1
+
+# Preprocessing
+  target_encoder__n_splits: 10
+
+# Light GBM
+  lgbm_random_search_runs: 10
+  lgbm__boosting_type: gbdt
+  lgbm__objective: binary
+  lgbm__metric: auc
+  lgbm__number_boosting_rounds: 1000
+  lgbm__early_stopping_rounds: 30
+  lgbm__learning_rate: '[0.01, 0.2, "uniform"]'
+  lgbm__num_leaves: '[25, 35]'
+  lgbm__max_depth: '[10, 20]'
+  lgbm__min_child_samples: '[35, 45]'
+  lgbm__max_bin: '[275, 325]'
+  lgbm__subsample: '[0.8, 1., 1.2, "list"]'
+  lgbm__subsample_freq: 5
+  lgbm__colsample_bytree: 0.8
+  lgbm__min_child_weight: 4
+  lgbm__reg_lambda: 0.0
+  lgbm__reg_alpha: 0.1
+  lgbm__scale_pos_weight: 1
diff --git a/pipeline_config.py b/pipeline_config.py
new file mode 100644
index 0000000..73ea34a
--- /dev/null
+++ b/pipeline_config.py
@@ -0,0 +1,180 @@
+import os
+
+from attrdict import AttrDict
+from deepsense import neptune
+
+from utils import read_params, safe_eval
+
+ctx = neptune.Context()
+params = read_params(ctx)
+
+CATEGORICAL_COLUMNS = ['CODE_GENDER',
+                       'EMERGENCYSTATE_MODE',
+                       'FLAG_CONT_MOBILE',
+                       'FLAG_DOCUMENT_3',
+                       'FLAG_DOCUMENT_4',
+                       'FLAG_DOCUMENT_5',
+                       'FLAG_DOCUMENT_6',
+                       'FLAG_DOCUMENT_7',
+                       'FLAG_DOCUMENT_8',
+                       'FLAG_DOCUMENT_9',
+                       'FLAG_DOCUMENT_11',
+                       'FLAG_DOCUMENT_18',
+                       'FLAG_EMAIL',
+                       'FLAG_EMP_PHONE',
+                       'FLAG_MOBIL',
+                       'FLAG_OWN_CAR',
+                       'FLAG_OWN_REALTY',
+                       'FLAG_PHONE',
+                       'FLAG_WORK_PHONE',
+                       'FONDKAPREMONT_MODE',
+                       'HOUR_APPR_PROCESS_START',
+                       'HOUSETYPE_MODE',
+                       'LIVE_CITY_NOT_WORK_CITY',
+                       'LIVE_REGION_NOT_WORK_REGION',
+                       'NAME_CONTRACT_TYPE',
+                       'NAME_TYPE_SUITE',
+                       'NAME_INCOME_TYPE',
+                       'NAME_EDUCATION_TYPE',
+                       'NAME_FAMILY_STATUS',
+                       'NAME_HOUSING_TYPE',
+                       'OCCUPATION_TYPE',
+                       'ORGANIZATION_TYPE',
+                       'REG_CITY_NOT_LIVE_CITY',
+                       'REG_CITY_NOT_WORK_CITY',
+                       'REG_REGION_NOT_LIVE_REGION',
+                       'REG_REGION_NOT_WORK_REGION',
+                       'WALLSMATERIAL_MODE',
+                       'WEEKDAY_APPR_PROCESS_START']
+NUMERICAL_COLUMNS = ['AMT_ANNUITY',
+                     'AMT_CREDIT',
+                     'AMT_GOODS_PRICE',
+                     'AMT_INCOME_TOTAL',
+                     'AMT_REQ_CREDIT_BUREAU_HOUR',
+                     'AMT_REQ_CREDIT_BUREAU_DAY',
+                     'AMT_REQ_CREDIT_BUREAU_WEEK',
+                     'AMT_REQ_CREDIT_BUREAU_MON',
+                     'AMT_REQ_CREDIT_BUREAU_QRT',
+                     'AMT_REQ_CREDIT_BUREAU_YEAR',
+                     'APARTMENTS_AVG',
+                     'APARTMENTS_MEDI',
+                     'APARTMENTS_MODE',
+                     'BASEMENTAREA_AVG',
+                     'BASEMENTAREA_MEDI',
+                     'BASEMENTAREA_MODE',
+                     'COMMONAREA_AVG',
+                     'COMMONAREA_MEDI',
+                     'COMMONAREA_MODE',
+                     'CNT_CHILDREN',
+                     'CNT_FAM_MEMBERS',
+                     'DAYS_BIRTH',
+                     'DAYS_EMPLOYED',
+                     'DAYS_ID_PUBLISH',
+                     'DAYS_LAST_PHONE_CHANGE',
+                     'DAYS_REGISTRATION',
+                     'DEF_30_CNT_SOCIAL_CIRCLE',
+                     'DEF_60_CNT_SOCIAL_CIRCLE',
+                     'ELEVATORS_AVG',
+                     'ELEVATORS_MEDI',
+                     'ELEVATORS_MODE',
+                     'ENTRANCES_AVG',
+                     'ENTRANCES_MEDI',
+                     'ENTRANCES_MODE',
+                     'EXT_SOURCE_1',
+                     'EXT_SOURCE_2',
+                     'EXT_SOURCE_3',
+                     'FLOORSMAX_AVG',
+                     'FLOORSMAX_MEDI',
+                     'FLOORSMAX_MODE',
+                     'FLOORSMIN_AVG',
+                     'FLOORSMIN_MEDI',
+                     'FLOORSMIN_MODE',
+                     'LANDAREA_AVG',
+                     'LANDAREA_MEDI',
+                     'LANDAREA_MODE',
+                     'LIVINGAPARTMENTS_AVG',
+                     'LIVINGAPARTMENTS_MEDI',
+                     'LIVINGAPARTMENTS_MODE',
+                     'LIVINGAREA_AVG',
+                     'LIVINGAREA_MEDI',
+                     'LIVINGAREA_MODE',
+                     'NONLIVINGAPARTMENTS_AVG',
+                     'NONLIVINGAPARTMENTS_MEDI',
+                     'NONLIVINGAPARTMENTS_MODE',
+                     'NONLIVINGAREA_AVG',
+                     'NONLIVINGAREA_MEDI',
+                     'NONLIVINGAREA_MODE',
+                     'OBS_30_CNT_SOCIAL_CIRCLE',
+                     'OBS_60_CNT_SOCIAL_CIRCLE',
+                     'OWN_CAR_AGE',
+                     'REGION_POPULATION_RELATIVE',
+                     'REGION_RATING_CLIENT',
+                     'REGION_RATING_CLIENT_W_CITY',
+                     'TOTALAREA_MODE',
+                     'YEARS_BEGINEXPLUATATION_AVG',
+                     'YEARS_BEGINEXPLUATATION_MEDI',
+                     'YEARS_BEGINEXPLUATATION_MODE',
+                     'YEARS_BUILD_AVG',
+                     'YEARS_BUILD_MEDI',
+                     'YEARS_BUILD_MODE']
+TIMESTAMP_COLUMNS = []
+USELESS_COLUMNS = ['FLAG_DOCUMENT_10',
+                   'FLAG_DOCUMENT_12',
+                   'FLAG_DOCUMENT_13',
+                   'FLAG_DOCUMENT_14',
+                   'FLAG_DOCUMENT_15',
+                   'FLAG_DOCUMENT_16',
+                   'FLAG_DOCUMENT_17',
+                   'FLAG_DOCUMENT_19',
+                   'FLAG_DOCUMENT_2',
+                   'FLAG_DOCUMENT_20',
+                   'FLAG_DOCUMENT_21']
+
+ID_COLUMNS = ['SK_ID_CURR']
+TARGET_COLUMNS = ['TARGET']
+
+DEV_SAMPLE_SIZE = int(10e4)
+
+SOLUTION_CONFIG = AttrDict({
+    'env': {'cache_dirpath': params.experiment_dir
+            },
+
+    'dataframe_by_type_splitter': {'numerical_columns': NUMERICAL_COLUMNS,
+                                   'categorical_columns': CATEGORICAL_COLUMNS,
+                                   'timestamp_columns': TIMESTAMP_COLUMNS,
+                                   },
+
+    'light_gbm': {'boosting_type': safe_eval(params.lgbm__boosting_type),
+                  'objective': safe_eval(params.lgbm__objective),
+                  'metric': safe_eval(params.lgbm__metric),
+                  'learning_rate': safe_eval(params.lgbm__learning_rate),
+                  'max_depth': safe_eval(params.lgbm__max_depth),
+                  'subsample': safe_eval(params.lgbm__subsample),
+                  'colsample_bytree': safe_eval(params.lgbm__colsample_bytree),
+                  'min_child_weight': safe_eval(params.lgbm__min_child_weight),
+                  'reg_lambda': safe_eval(params.lgbm__reg_lambda),
+                  'reg_alpha': safe_eval(params.lgbm__reg_alpha),
+                  'subsample_freq': safe_eval(params.lgbm__subsample_freq),
+                  'max_bin': safe_eval(params.lgbm__max_bin),
+                  'min_child_samples': safe_eval(params.lgbm__min_child_samples),
+                  'num_leaves': safe_eval(params.lgbm__num_leaves),
+                  'nthread': safe_eval(params.num_workers),
+                  'number_boosting_rounds': safe_eval(params.lgbm__number_boosting_rounds),
+                  'early_stopping_rounds': safe_eval(params.lgbm__early_stopping_rounds),
+                  'verbose': safe_eval(params.verbose)
+                  },
+
+    'random_search': {'light_gbm': {'n_runs': params.lgbm_random_search_runs,
+                                    'callbacks': {'neptune_monitor': {'name': 'light_gbm'
+                                                                      },
+                                                  'save_results': {'filepath': os.path.join(params.experiment_dir,
+                                                                                            'random_search_light_gbm.pkl')
+                                                                   }
+                                                  }
+                                    }
+                      },
+
+    'clipper': {'min_val': 0,
+                'max_val': 1
+                }
+})
diff --git a/pipelines.py b/pipelines.py
new file mode 100644
index 0000000..840b2c0
--- /dev/null
+++ b/pipelines.py
@@ -0,0 +1,248 @@
+from functools import partial
+
+from sklearn.metrics import roc_auc_score
+from steppy.adapter import Adapter, E
+from steppy.base import Step
+
+import feature_extraction as fe
+from hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, SaveResults
+from models import LightGBMLowMemory as LightGBM
+from postprocessing import Clipper
+
+
+def lightGBM(config, train_mode):
+    if train_mode:
+        features, features_valid = feature_extraction(config,
+                                                      train_mode,
+                                                      save_output=True,
+                                                      cache_output=True,
+                                                      load_saved_output=True)
+        light_gbm = classifier_lgbm((features, features_valid),
+                                    config,
+                                    train_mode)
+    else:
+        features = feature_extraction(config,
+                                      train_mode,
+                                      cache_output=True)
+        light_gbm = classifier_lgbm(features,
+                                    config,
+                                    train_mode)
+
+    clipper = Step(name='clipper',
+                   transformer=Clipper(**config.clipper),
+                   input_steps=[light_gbm],
+                   adapter=Adapter({'prediction': E(light_gbm.name, 'prediction')}),
+                   cache_dirpath=config.env.cache_dirpath)
+
+    return clipper
+
+
+def feature_extraction(config, train_mode, **kwargs):
+    if train_mode:
+        feature_by_type_split, feature_by_type_split_valid = _feature_by_type_splits(config, train_mode)
+
+        target_encoder, target_encoder_valid = _target_encoders((feature_by_type_split, feature_by_type_split_valid),
+                                                                config, train_mode,
+                                                                **kwargs)
+
+        feature_combiner, feature_combiner_valid = _join_features(numerical_features=[feature_by_type_split],
+                                                                  numerical_features_valid=[feature_by_type_split_valid],
+                                                                  categorical_features=[target_encoder],
+                                                                  categorical_features_valid=[target_encoder_valid],
+                                                                  config=config,
+                                                                  train_mode=train_mode,
+                                                                  **kwargs)
+
+        return feature_combiner, feature_combiner_valid
+    else:
+        feature_by_type_split = _feature_by_type_splits(config, train_mode)
+
+        target_encoder = _target_encoders(feature_by_type_split, config, train_mode, **kwargs)
+
+        feature_combiner = _join_features(numerical_features=[feature_by_type_split],
+                                          numerical_features_valid=[],
+                                          categorical_features=[target_encoder],
+                                          categorical_features_valid=[],
+                                          config=config,
+                                          train_mode=train_mode,
+                                          **kwargs)
+
+        return feature_combiner
+
+
+def _feature_by_type_splits(config, train_mode):
+    if train_mode:
+        feature_by_type_split = Step(name='feature_by_type_split',
+                                     transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter),
+                                     input_data=['input'],
+                                     adapter=Adapter({'X': E('input', 'X')}),
+                                     cache_dirpath=config.env.cache_dirpath)
+
+        feature_by_type_split_valid = Step(name='feature_by_type_split_valid',
+                                           transformer=feature_by_type_split,
+                                           input_data=['input'],
+                                           adapter=Adapter({'X': E('input', 'X_valid')}),
+                                           cache_dirpath=config.env.cache_dirpath)
+
+        return feature_by_type_split, feature_by_type_split_valid
+
+    else:
+        feature_by_type_split = Step(name='feature_by_type_split',
+                                     transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter),
+                                     input_data=['input'],
+                                     adapter=Adapter({'X': E('input', 'X')}),
+                                     cache_dirpath=config.env.cache_dirpath)
+
+        return feature_by_type_split
+
+
+def _join_features(numerical_features,
+                   numerical_features_valid,
+                   categorical_features,
+                   categorical_features_valid,
+                   config, train_mode,
+                   **kwargs):
+    if train_mode:
+        feature_joiner = Step(name='feature_joiner',
+                              transformer=fe.FeatureJoiner(),
+                              input_steps=numerical_features + categorical_features,
+                              adapter=Adapter({
+                                  'numerical_feature_list': [
+                                      E(feature.name, 'numerical_features') for feature in numerical_features],
+                                  'categorical_feature_list': [
+                                      E(feature.name, 'categorical_features') for feature in categorical_features],
+                              }),
+                              cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        feature_joiner_valid = Step(name='feature_joiner_valid',
+                                    transformer=feature_joiner,
+                                    input_steps=numerical_features_valid + categorical_features_valid,
+                                    adapter=Adapter({
+                                        'numerical_feature_list': [
+                                            E(feature.name,
+                                              'numerical_features') for feature in numerical_features_valid],
+                                        'categorical_feature_list': [
+                                            E(feature.name,
+                                              'categorical_features') for feature in categorical_features_valid],
+                                    }),
+                                    cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        return feature_joiner, feature_joiner_valid
+
+    else:
+        feature_joiner = Step(name='feature_joiner',
+                              transformer=fe.FeatureJoiner(),
+                              input_steps=numerical_features + categorical_features,
+                              adapter=Adapter({
+                                  'numerical_feature_list': [
+                                      E(feature.name, 'numerical_features') for feature in numerical_features],
+                                  'categorical_feature_list': [
+                                      E(feature.name, 'categorical_features') for feature in categorical_features],
+                              }),
+                              cache_dirpath=config.env.cache_dirpath, **kwargs)
+
+        return feature_joiner
+
+
+def classifier_lgbm(features, config, train_mode, **kwargs):
+    if train_mode:
+        features_train, features_valid = features
+        if config.random_search.light_gbm.n_runs:
+            transformer = RandomSearchOptimizer(LightGBM, config.light_gbm,
+                                                train_input_keys=[],
+                                                valid_input_keys=['X_valid', 'y_valid'],
+                                                score_func=roc_auc_score,
+                                                maximize=True,
+                                                n_runs=config.random_search.light_gbm.n_runs,
+                                                callbacks=[NeptuneMonitor(
+                                                    **config.random_search.light_gbm.callbacks.neptune_monitor),
+                                                    SaveResults(
+                                                        **config.random_search.light_gbm.callbacks.save_results)
+                                                ])
+        else:
+            transformer = LightGBM(**config.light_gbm)
+
+        light_gbm = Step(name='light_gbm',
+                         transformer=transformer,
+                         input_data=['input'],
+                         input_steps=[features_train, features_valid],
+                         adapter=Adapter({'X': E(features_train.name, 'features'),
+                                          'y': E('input', 'y'),
+                                          'feature_names': E(features_train.name, 'feature_names'),
+                                          'categorical_features': E(features_train.name, 'categorical_features'),
+                                          'X_valid': E(features_valid.name, 'features'),
+                                          'y_valid': E('input', 'y_valid'),
+                                          }),
+                         cache_dirpath=config.env.cache_dirpath,
+                         **kwargs)
+    else:
+        light_gbm = Step(name='light_gbm',
+                         transformer=LightGBM(**config.light_gbm),
+                         input_steps=[features],
+                         adapter=Adapter({'X': E(features.name, 'features')}),
+                         cache_dirpath=config.env.cache_dirpath,
+                         **kwargs)
+    return light_gbm
+
+
+def _target_encoders(dispatchers, config, train_mode, **kwargs):
+    if train_mode:
+        feature_by_type_split, feature_by_type_split_valid = dispatchers
+        numpy_label, numpy_label_valid = _to_numpy_label(config, **kwargs)
+        target_encoder = Step(name='target_encoder',
+                              transformer=fe.TargetEncoder(),
+                              input_data=['input'],
+                              input_steps=[feature_by_type_split, numpy_label],
+                              adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features'),
+                                               'y': E(numpy_label.name, 'y'),
+                                               }),
+                              cache_dirpath=config.env.cache_dirpath,
+                              **kwargs)
+
+        target_encoder_valid = Step(name='target_encoder_valid',
+                                    transformer=target_encoder,
+                                    input_data=['input'],
+                                    input_steps=[feature_by_type_split_valid, numpy_label_valid],
+                                    adapter=Adapter({'X': E(feature_by_type_split_valid.name, 'categorical_features'),
+                                                     'y': E(numpy_label_valid.name, 'y'),
+                                                     }),
+                                    cache_dirpath=config.env.cache_dirpath,
+                                    **kwargs)
+
+        return target_encoder, target_encoder_valid
+
+    else:
+        feature_by_type_split = dispatchers
+
+        target_encoder = Step(name='target_encoder',
+                              transformer=fe.TargetEncoder(),
+                              input_data=['input'],
+                              input_steps=[feature_by_type_split],
+                              adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features')}),
+                              cache_dirpath=config.env.cache_dirpath,
+                              **kwargs)
+
+        return target_encoder
+
+
+def _to_numpy_label(config, **kwargs):
+    to_numpy_label = Step(name='to_numpy_label',
+                          transformer=fe.ToNumpyLabel(),
+                          input_data=['input'],
+                          adapter=Adapter({'y': [E('input', 'y')]}),
+                          cache_dirpath=config.env.cache_dirpath,
+                          **kwargs)
+
+    to_numpy_label_valid = Step(name='to_numpy_label_valid',
+                                transformer=to_numpy_label,
+                                input_data=['input'],
+                                adapter=Adapter({'y': [E('input', 'y_valid')]}),
+                                cache_dirpath=config.env.cache_dirpath,
+                                **kwargs)
+
+    return to_numpy_label, to_numpy_label_valid
+
+
+PIPELINES = {'lightGBM': {'train': partial(lightGBM, train_mode=True),
+                          'inference': partial(lightGBM, train_mode=False)},
+             }
diff --git a/postprocessing.py b/postprocessing.py
new file mode 100644
index 0000000..0aa3d46
--- /dev/null
+++ b/postprocessing.py
@@ -0,0 +1,13 @@
+import numpy as np
+
+from steppy.base import BaseTransformer
+
+
+class Clipper(BaseTransformer):
+    def __init__(self, min_val=0, max_val=1):
+        self.min_val = min_val
+        self.max_val = max_val
+
+    def transform(self, prediction):
+        prediction_ = np.clip(prediction, self.min_val, self.max_val)
+        return {'clipped_prediction': prediction_}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ab9b40f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+neptune-cli
+steppy-toolkit
+steppy==0.1.1
+attrdict==2.0.0
+category_encoders==1.2.6
+click==6.7
+lightgbm==2.1.1
+numpy==1.14.3
+pandas==0.23.0
+scikit_learn==0.19.1
+PyYAML==3.12
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..807fdc2
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,103 @@
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+import pandas as pd
+import yaml
+from attrdict import AttrDict
+from sklearn.model_selection import train_test_split
+
+
+def create_submission(meta, predictions):
+    submission = pd.DataFrame({'SK_ID_CURR': meta['SK_ID_CURR'].tolist(),
+                               'TARGET': predictions
+                               })
+    return submission
+
+
+def verify_submission(submission, sample_submission):
+
+    assert submission.shape == sample_submission.shape, \
+        'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape)
+
+    for submission_id, correct_id in zip(submission['SK_ID_CURR'].values, sample_submission['SK_ID_CURR'].values):
+        assert correct_id == submission_id, \
+            'Wrong id: expected {} but got {}'.format(correct_id, submission_id)
+
+
+def get_logger():
+    return logging.getLogger('home-credit')
+
+
+def init_logger():
+    logger = logging.getLogger('home-credit')
+    logger.setLevel(logging.INFO)
+    message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
+                                       datefmt='%Y-%m-%d %H-%M-%S')
+
+    # console handler for validation info
+    ch_va = logging.StreamHandler(sys.stdout)
+    ch_va.setLevel(logging.INFO)
+
+    ch_va.setFormatter(fmt=message_format)
+
+    # add the handlers to the logger
+    logger.addHandler(ch_va)
+
+    return logger
+
+
+def log_loss_row(y_true, y_pred, eps=1e-15):
+    y_pred = np.clip(y_pred, eps, 1 - eps)
+    scores = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
+    return scores
+
+
+def read_params(ctx):
+    if ctx.params.__class__.__name__ == 'OfflineContextParams':
+        try:
+            neptune_config = read_yaml('neptune.yaml')
+        except FileNotFoundError:
+            neptune_config = read_yaml('../neptune.yaml')
+        params = neptune_config.parameters
+    else:
+        params = ctx.params
+    return params
+
+
+def read_yaml(filepath):
+    with open(filepath) as f:
+        config = yaml.load(f)
+    return AttrDict(config)
+
+
+def safe_eval(obj):
+    try:
+        return eval(obj)
+    except Exception:
+        return obj
+
+
+def save_evaluation_predictions(experiment_dir, y_true, y_pred, raw_data):
+    raw_data['y_pred'] = y_pred
+    raw_data['score'] = log_loss_row(y_true, y_pred)
+
+    raw_data.sort_values('score', ascending=False, inplace=True)
+
+    filepath = os.path.join(experiment_dir, 'evaluation_predictions.csv')
+    raw_data.to_csv(filepath, index=None)
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+
+
+def stratified_train_valid_split(meta_train, target_column, target_bins, valid_size, random_state=1234):
+    y = meta_train[target_column].values
+    bins = np.linspace(0, y.shape[0], target_bins)
+    y_binned = np.digitize(y, bins)
+
+    return train_test_split(meta_train, test_size=valid_size, stratify=y_binned, random_state=random_state)
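Usage sketch, not part of the patch: main.py registers the click commands train, evaluate, predict, train_evaluate_predict, evaluate_predict and train_evaluate, each taking -p/--pipeline_name (the only pipeline defined in pipelines.py is 'lightGBM') and an optional -d/--dev_mode flag, while read_params() falls back to neptune.yaml when the neptune context reports offline parameters. Assuming the YOUR/PATH/... placeholders in neptune.yaml have been filled in and the pinned requirements are installed, a local run might look like:

    python main.py train_evaluate_predict --pipeline_name lightGBM --dev_mode

Dropping --dev_mode trains on the full application_train.csv instead of the DEV_SAMPLE_SIZE sample, and the submission is then written to experiment_dir/submission.csv.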
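Illustrative sketch, not part of the patch: the quoted lists in neptune_random_search.yaml (e.g. '[0.01, 0.2, "uniform"]') are turned into Python lists by safe_eval() in pipeline_config.py and then sampled by create_param_space()/sample_param_space() in hyperparameter_tuning.py — a two-element list becomes an integer 'choice' range, a trailing "uniform"/"log-uniform" string selects continuous sampling, a trailing "list" picks one of the listed values, and scalars pass through unchanged. A minimal standalone run (parameter names and values below are made up; assumes the pinned requirements, including neptune-cli for the deepsense import, are installed):

    import numpy as np

    from hyperparameter_tuning import create_param_space

    # Hypothetical search space written in the same format as the lgbm__* entries.
    params = {
        'learning_rate': [0.01, 0.2, 'uniform'],  # continuous uniform draw between 0.01 and 0.2
        'num_leaves': [25, 35],                   # two elements -> integer choice from range(25, 35)
        'subsample': [0.8, 1.0, 1.2, 'list'],     # pick one of the listed values
        'metric': 'auc',                          # scalar -> kept as-is in every run
    }

    np.random.seed(0)  # create_param_space draws its own per-run seeds from numpy's RNG
    param_space = create_param_space(params, n_runs=3)
    for param_set in param_space:
        print(param_set)  # one sampled parameter dict per random-search run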
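Illustrative sketch, not part of the patch: the feature_extraction transformers communicate through dictionaries, which is what the Adapter mappings in pipelines.py rely on — DataFrameByTypeSplitter returns 'numerical_features'/'categorical_features' frames, and FeatureJoiner returns the concatenated float32 'features' matrix plus the 'feature_names' and 'categorical_features' name lists. A toy call outside the Step machinery (column names and values are invented; the real pipeline target-encodes categoricals before joining, so numeric codes stand in here):

    import pandas as pd

    from feature_extraction import DataFrameByTypeSplitter, FeatureJoiner

    X = pd.DataFrame({'AMT_CREDIT': [100.0, 250.0],   # numerical column
                      'CODE_GENDER': [0, 1]})         # categorical column, already numeric in this toy

    splitter = DataFrameByTypeSplitter(numerical_columns=['AMT_CREDIT'],
                                       categorical_columns=['CODE_GENDER'],
                                       timestamp_columns=[])
    split = splitter.transform(X)  # dict with 'numerical_features', 'categorical_features', 'timestamp_features'

    joined = FeatureJoiner().transform(numerical_feature_list=[split['numerical_features']],
                                       categorical_feature_list=[split['categorical_features']])
    print(joined['feature_names'])         # ['AMT_CREDIT', 'CODE_GENDER']
    print(joined['categorical_features'])  # ['CODE_GENDER']
    print(joined['features'].dtypes)       # float32 columns, as consumed by LightGBMLowMemory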