Skip to content
This repository has been archived by the owner on Jun 22, 2022. It is now read-only.

Commit

Permalink
Dev solution 1 (#40)
Browse files Browse the repository at this point in the history
* dev-Solution1 (#16)

* solution 1

* Add features

* random-search (#17)

* random-search

* Update neptune.yaml

* Solution 1 - upgrade (#18)

* Remove save_eval function

* Add more features

* Refactor code - steppy 0.1.1 (#22)

* Refactor code

* Include suggested changes

* Update models.py

* Verifying submission (#26)

* Refactor code

* Do not save submission if dev_mode

* Verifying submission

* Add USELESS_COLUMNS

* Include remaining features

* Fix random search

* added requirements

* optimized imports

* Fix solution-1 (#38)

* Refactor code

* Fix

* small refactor

* added neptune-cli to requirements

* added steppy-toolkit to requirements

* one line clip in postprocessing (#39)

* cleaning code for release of the solution-1

* cleaning code for the solution-1

* cleaning code for the solution-1 (again)
  • Loading branch information
Kamil A. Kaczmarek authored Jun 5, 2018
1 parent 7525aee commit 0dd4b14
Show file tree
Hide file tree
Showing 12 changed files with 1,130 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ tests/.cache
.cache/
.idea/
.idea_modules/
*_local.yaml
out/
output
output/
Expand Down
90 changes: 90 additions & 0 deletions feature_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from steppy.adapters import to_numpy_label_inputs
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class DataFrameByTypeSplitter(BaseTransformer):
    """Splits a dataframe into numerical, categorical and timestamp sub-frames.

    Each column group is optional: pass ``None`` for a group to omit it
    from the transform output entirely.
    """

    def __init__(self, numerical_columns, categorical_columns, timestamp_columns):
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.timestamp_columns = timestamp_columns

    def transform(self, X, y=None, **kwargs):
        """Return a dict mapping '<type>_features' to the matching column subset of X."""
        column_groups = [
            ('numerical_features', self.numerical_columns),
            ('categorical_features', self.categorical_columns),
            ('timestamp_features', self.timestamp_columns),
        ]
        # Only groups whose column list was actually provided appear in the output.
        return {output_key: X[columns]
                for output_key, columns in column_groups
                if columns is not None}


class FeatureJoiner(BaseTransformer):
    """Concatenates numerical and categorical feature frames into one float32 matrix."""

    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
        """Join all feature frames column-wise.

        Args:
            numerical_feature_list: list of pandas DataFrames/Series with numeric features.
            categorical_feature_list: list of pandas DataFrames/Series with categorical features.

        Returns:
            dict with 'features' (float32 DataFrame), 'feature_names' (all column
            names) and 'categorical_features' (names of the categorical columns only).
        """
        features = numerical_feature_list + categorical_feature_list
        # Indices may differ between sources; reset them so concat aligns by position.
        for feature in features:
            feature.reset_index(drop=True, inplace=True)
        outputs = {
            'features': pd.concat(features, axis=1).astype(np.float32),
            'feature_names': self._get_feature_names(features),
            'categorical_features': self._get_feature_names(categorical_feature_list),
        }
        return outputs

    def _get_feature_names(self, dataframes):
        """Collect column names, falling back to ``.name`` for Series inputs."""
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except AttributeError:
                # Fix: the original caught every Exception and printed it, which
                # hid real errors. Only a Series lacking ``.columns`` is expected here.
                feature_names.append(dataframe.name)
        return feature_names


class TargetEncoder(BaseTransformer):
    """Wraps ``category_encoders.TargetEncoder``, encoding every column of X against y."""

    def __init__(self, **kwargs):
        # Keyword arguments are forwarded verbatim to the underlying encoder.
        self.params = kwargs
        self.encoder_class = ce.TargetEncoder

    def fit(self, X, y, **kwargs):
        """Fit a target encoder over all columns of X using target y."""
        columns_to_encode = list(X.columns)
        self.target_encoder = self.encoder_class(cols=columns_to_encode, **self.params)
        self.target_encoder.fit(X, y)
        return self

    def transform(self, X, y=None, **kwargs):
        """Return the target-encoded frame under the 'categorical_features' key."""
        encoded = self.target_encoder.transform(X)
        return {'categorical_features': encoded}

    def load(self, filepath):
        """Restore a previously fitted encoder from disk."""
        self.target_encoder = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Persist the fitted encoder to disk."""
        joblib.dump(self.target_encoder, filepath)


class ToNumpyLabel(BaseTransformer):
    """Captures labels as a numpy array in ``fit`` and passes them through ``transform``."""

    def __init__(self, **kwargs):
        # Labels captured in fit(); stays None until fit() has been called.
        self.y = None

    def fit(self, y, **kwargs):
        """Convert and store the label input as a numpy array."""
        self.y = to_numpy_label_inputs(y)
        return self

    def transform(self, **kwargs):
        """Return {'y': labels} when labels were set in fit(), else an empty dict.

        Fix: the original checked ``self.y.any()``, which (a) raised
        AttributeError when transform() ran before fit() (``None.any()``) and
        (b) wrongly dropped a label vector that happened to be all zeros.
        Checking for presence instead of truthiness handles both.
        """
        if self.y is not None:
            return {'y': self.y}
        return {}
155 changes: 155 additions & 0 deletions hyperparameter_tuning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import gc

import numpy as np
from deepsense import neptune
from sklearn.externals import joblib
from steppy.base import BaseTransformer

from utils import set_seed


class RandomSearchOptimizer(BaseTransformer):
    """Random-search wrapper around a transformer class.

    Samples ``n_runs`` parameter sets, fits a fresh transformer for each,
    scores it on the validation inputs with ``score_func`` and keeps the
    best-scoring configuration, which is refitted and used by ``transform``.
    """

    def __init__(self, TransformerClass, params,
                 score_func, maximize,
                 train_input_keys, valid_input_keys,
                 n_runs,
                 callbacks=None):
        """
        Args:
            TransformerClass: class instantiated for every sampled parameter set.
            params: search-space description accepted by ``create_param_space``.
            score_func: callable(y_true, y_pred) -> float used to rank runs.
            maximize: if True the highest score wins, otherwise the lowest.
            train_input_keys: kwargs keys forwarded to fit (falsy -> all kwargs).
            valid_input_keys: (X_valid_key, y_valid_key) pair used for scoring.
            n_runs: number of parameter sets to sample and evaluate.
            callbacks: optional list of GridSearchCallback instances.
                Fix: the original used a mutable default (``callbacks=[]``),
                which is shared across all instances that omit the argument.
        """
        self.TransformerClass = TransformerClass
        self.param_space = create_param_space(params, n_runs)
        self.train_input_keys = train_input_keys
        self.valid_input_keys = valid_input_keys
        self.score_func = score_func
        self.maximize = maximize
        self.callbacks = callbacks if callbacks is not None else []
        # Placeholder so transform()/save()/load() work even before fit().
        self.best_transformer = TransformerClass(**self.param_space[0])

    def fit(self, **kwargs):
        """Evaluate every sampled parameter set, then refit the best one."""
        if self.train_input_keys:
            train_inputs = {input_key: kwargs[input_key] for input_key in self.train_input_keys}
        else:
            train_inputs = kwargs
        X_valid, y_valid = kwargs[self.valid_input_keys[0]], kwargs[self.valid_input_keys[1]]

        results = []
        for i, param_set in enumerate(self.param_space):
            try:
                transformer = self.TransformerClass(**param_set)
                transformer.fit(**train_inputs)
            except Exception:
                # Best effort: an invalid parameter combination must not abort
                # the whole search; the failed run is simply skipped.
                continue
            y_pred_valid = transformer.transform(X_valid)
            y_pred_valid_value = list(y_pred_valid.values())[0]
            run_score = self.score_func(y_valid, y_pred_valid_value)
            results.append((run_score, param_set))

            # Free the fitted model before the next run to cap memory usage.
            del y_pred_valid, transformer
            gc.collect()

            for callback in self.callbacks:
                callback.on_run_end(score=run_score, params=param_set)

        assert len(results) > 0, 'All random search runs failed, check your parameter space'
        results_sorted = sorted(results, key=lambda x: x[0])

        if self.maximize:
            best_score, best_param_set = results_sorted[-1]
        else:
            best_score, best_param_set = results_sorted[0]

        for callback in self.callbacks:
            callback.on_search_end(results=results)

        self.best_transformer = self.TransformerClass(**best_param_set)
        self.best_transformer.fit(**train_inputs)
        return self

    def transform(self, **kwargs):
        """Delegate to the best transformer found by fit()."""
        return self.best_transformer.transform(**kwargs)

    def save(self, filepath):
        """Persist the best transformer."""
        self.best_transformer.save(filepath)

    def load(self, filepath):
        """Restore the best transformer from disk."""
        self.best_transformer.load(filepath)
        return self


def create_param_space(params, n_runs):
    """Draw ``n_runs`` hyper-parameter dictionaries from the search space ``params``.

    A list value is sampled: a two-element list is treated as a 'choice'
    integer range, otherwise the last element names the sampling mode and
    the remaining elements describe the range. Scalar values are copied
    through unchanged.
    """
    base_seed = np.random.randint(1000)
    param_space = []
    for run_index in range(n_runs):
        # Reseed per run so each parameter set is reproducibly distinct.
        set_seed(base_seed + run_index)
        sampled = {}
        for name, spec in params.items():
            if not isinstance(spec, list):
                sampled[name] = spec
            elif len(spec) == 2:
                sampled[name] = sample_param_space(spec, 'choice')
            else:
                sampled[name] = sample_param_space(spec[:-1], spec[-1])
        param_space.append(sampled)
    return param_space


def sample_param_space(value_range, mode):
    """Draw a single value from ``value_range`` according to ``mode``.

    Modes:
        'list'        -- pick one element of ``value_range`` uniformly.
        'choice'      -- pick a random integer in [range_min, range_max).
        'uniform'     -- continuous uniform draw in [range_min, range_max).
        'log-uniform' -- uniform in log space: exp(U(log(min), log(max))).

    Raises:
        NotImplementedError: if ``mode`` is not one of the modes above.
    """
    if mode == 'list':
        value = np.random.choice(value_range)
    else:
        range_min, range_max = value_range
        if mode == 'choice':
            value = np.random.choice(range(range_min, range_max, 1))
        elif mode == 'uniform':
            value = np.random.uniform(low=range_min, high=range_max)
        elif mode == 'log-uniform':
            value = np.exp(np.random.uniform(low=np.log(range_min), high=np.log(range_max)))
        else:
            # Fix: the original raised a bare NotImplementedError, giving the
            # caller no hint which mode string was rejected.
            raise NotImplementedError('Unknown sampling mode: {}'.format(mode))
    return value


class GridSearchCallback:
    """Base class for search callbacks; both hooks are optional no-ops.

    Fix: the original methods did ``return NotImplementedError`` — returning
    the exception *class* instead of raising it, so they were effectively
    silent no-ops with a misleading return value. Since subclasses such as
    ``SaveResults`` deliberately override only one hook while the optimizer
    calls both, the correct contract is explicit no-op hooks (raising here
    would break those partial subclasses).
    """

    def on_run_end(self, score, params):
        """Called after each single search run with its score and parameter set."""

    def on_search_end(self, results):
        """Called once after the whole search with all (score, params) pairs."""


class NeptuneMonitor(GridSearchCallback):
    """Streams per-run scores/params and final best/worst results to Neptune channels."""

    def __init__(self, name):
        self.name = name
        self.ctx = neptune.Context()
        # Text channels for parameter sets; numeric scores go through channel_send.
        self.highest_params_channel = self._create_text_channel(name='highest params')
        self.lowest_params_channel = self._create_text_channel(name='lowest params')
        self.run_params_channel = self._create_text_channel(name='run params')
        self.run_id = 0

    def on_run_end(self, score, params):
        """Send the score and parameter set of a single finished run."""
        self.ctx.channel_send('score on run', x=self.run_id, y=score)
        self.run_params_channel.send(y=params)
        self.run_id += 1

    def on_search_end(self, results):
        """Report the best and worst (score, params) pairs from the whole search."""
        ordered = sorted(results, key=lambda result: result[0])
        lowest_score, lowest_param_set = ordered[0]
        highest_score, highest_param_set = ordered[-1]

        self.ctx.channel_send('highest score', x=0, y=highest_score)
        self.ctx.channel_send('lowest score', x=0, y=lowest_score)
        self.highest_params_channel.send(y=highest_param_set)
        self.lowest_params_channel.send(y=lowest_param_set)

    def _create_text_channel(self, name=''):
        """Create and return a Neptune TEXT channel with the given name."""
        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)


class SaveResults(GridSearchCallback):
    """Callback that persists the complete search results to disk at search end."""

    def __init__(self, filepath):
        # Destination path for the joblib dump.
        self.filepath = filepath

    def on_search_end(self, results):
        """Dump the list of (score, params) tuples to ``self.filepath``."""
        joblib.dump(results, self.filepath)
Loading

0 comments on commit 0dd4b14

Please sign in to comment.