This repository was archived by the owner on Jun 22, 2022 and is now read-only.

Commit
dev-Solution1 (#16)

* solution 1
* Add features
* random-search (#17)
* random-search
* Update neptune.yaml
* Solution 1 - upgrade (#18)
* Remove save_eval function
* Add more features
* Refactor code - steppy 0.1.1 (#22)
* Refactor code
* Include suggested changes
* Update models.py
* Verifying submission (#26)
* Refactor code
* Do not save submission if dev_mode
* Verifying submission
* Add USELESS_COLUMNS
* Include remaining features
* Fix random search
* added requirements
* optimized imports
* Fix solution-1 (#38)
* Refactor code
* Fix
* small refactor
* added neptune-cli to requirements
* added steppy-toolkit to requirements
* one line clip in postprocessing (#39)
* cleaning code for release of the solution-1
* cleaning code for the solution-1
* cleaning code for the solution-1 (again)
Kamil A. Kaczmarek authored Jun 5, 2018
1 parent 7525aee, commit 0dd4b14
Showing 12 changed files with 1,130 additions and 0 deletions.
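The "one line clip in postprocessing" change (#39) listed in the message above is not among the files shown here; as a hedged guess at its shape, such a clip usually just bounds raw predictions to valid probabilities:

import numpy as np

prediction = np.array([-0.02, 0.57, 1.13])  # toy raw model outputs (hypothetical)
prediction = np.clip(prediction, 0.0, 1.0)  # one-line clip to the [0, 1] range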
.gitignore (modified)
@@ -13,6 +13,7 @@ tests/.cache
 .cache/
 .idea/
 .idea_modules/
+*_local.yaml
 out/
 output
 output/
New file: feature extraction transformers
@@ -0,0 +1,90 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from steppy.adapters import to_numpy_label_inputs
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class DataFrameByTypeSplitter(BaseTransformer):
    def __init__(self, numerical_columns, categorical_columns, timestamp_columns):
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.timestamp_columns = timestamp_columns

    def transform(self, X, y=None, **kwargs):
        outputs = {}

        if self.numerical_columns is not None:
            outputs['numerical_features'] = X[self.numerical_columns]

        if self.categorical_columns is not None:
            outputs['categorical_features'] = X[self.categorical_columns]

        if self.timestamp_columns is not None:
            outputs['timestamp_features'] = X[self.timestamp_columns]

        return outputs


class FeatureJoiner(BaseTransformer):
    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
        features = numerical_feature_list + categorical_feature_list
        for feature in features:
            feature.reset_index(drop=True, inplace=True)
        outputs = {}
        outputs['features'] = pd.concat(features, axis=1).astype(np.float32)
        outputs['feature_names'] = self._get_feature_names(features)
        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
        return outputs

    def _get_feature_names(self, dataframes):
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except Exception as e:
                print(e)
                feature_names.append(dataframe.name)

        return feature_names


class TargetEncoder(BaseTransformer):
    def __init__(self, **kwargs):
        self.params = kwargs
        self.encoder_class = ce.TargetEncoder

    def fit(self, X, y, **kwargs):
        categorical_columns = list(X.columns)
        self.target_encoder = self.encoder_class(cols=categorical_columns, **self.params)
        self.target_encoder.fit(X, y)
        return self

    def transform(self, X, y=None, **kwargs):
        X_ = self.target_encoder.transform(X)
        return {'categorical_features': X_}

    def load(self, filepath):
        self.target_encoder = joblib.load(filepath)
        return self

    def save(self, filepath):
        joblib.dump(self.target_encoder, filepath)


class ToNumpyLabel(BaseTransformer):
    def __init__(self, **kwargs):
        self.y = None

    def fit(self, y, **kwargs):
        self.y = to_numpy_label_inputs(y)
        return self

    def transform(self, **kwargs):
        if self.y.any():
            return {'y': self.y}
        return {}
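In a steppy pipeline these transformers are chained through Step adapters; as a minimal standalone sketch of how they compose (the toy data and column names are hypothetical, not part of this commit):

import pandas as pd

X = pd.DataFrame({'credit_amount': [1000.0, 2500.0, 400.0],
                  'contract_type': ['cash', 'revolving', 'cash']})
y = pd.Series([0, 1, 0])

# Split columns by type, target-encode the categoricals, then join everything
# back into a single float32 feature frame with a record of feature names.
splitter = DataFrameByTypeSplitter(numerical_columns=['credit_amount'],
                                   categorical_columns=['contract_type'],
                                   timestamp_columns=None)
split = splitter.transform(X)

encoder = TargetEncoder()
encoder.fit(split['categorical_features'], y)
encoded = encoder.transform(split['categorical_features'])

joiner = FeatureJoiner()
joined = joiner.transform(numerical_feature_list=[split['numerical_features']],
                          categorical_feature_list=[encoded['categorical_features']])
print(joined['feature_names'])  # ['credit_amount', 'contract_type']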
New file: random search hyperparameter tuning
@@ -0,0 +1,155 @@
import gc

import numpy as np
from deepsense import neptune
from sklearn.externals import joblib
from steppy.base import BaseTransformer

from utils import set_seed


class RandomSearchOptimizer(BaseTransformer):
    def __init__(self, TransformerClass, params,
                 score_func, maximize,
                 train_input_keys, valid_input_keys,
                 n_runs,
                 callbacks=[]):
        self.TransformerClass = TransformerClass
        self.param_space = create_param_space(params, n_runs)
        self.train_input_keys = train_input_keys
        self.valid_input_keys = valid_input_keys
        self.score_func = score_func
        self.maximize = maximize
        self.callbacks = callbacks
        self.best_transformer = TransformerClass(**self.param_space[0])

    def fit(self, **kwargs):
        if self.train_input_keys:
            train_inputs = {input_key: kwargs[input_key] for input_key in self.train_input_keys}
        else:
            train_inputs = kwargs
        X_valid, y_valid = kwargs[self.valid_input_keys[0]], kwargs[self.valid_input_keys[1]]

        results = []
        for i, param_set in enumerate(self.param_space):
            try:
                transformer = self.TransformerClass(**param_set)
                transformer.fit(**train_inputs)
            except Exception:
                continue
            y_pred_valid = transformer.transform(X_valid)
            y_pred_valid_value = list(y_pred_valid.values())[0]
            run_score = self.score_func(y_valid, y_pred_valid_value)
            results.append((run_score, param_set))

            del y_pred_valid, transformer
            gc.collect()

            for callback in self.callbacks:
                callback.on_run_end(score=run_score, params=param_set)

        assert len(results) > 0, 'All random search runs failed, check your parameter space'
        results_sorted = sorted(results, key=lambda x: x[0])

        if self.maximize:
            best_score, best_param_set = results_sorted[-1]
        else:
            best_score, best_param_set = results_sorted[0]

        for callback in self.callbacks:
            callback.on_search_end(results=results)

        self.best_transformer = self.TransformerClass(**best_param_set)
        self.best_transformer.fit(**train_inputs)
        return self

    def transform(self, **kwargs):
        return self.best_transformer.transform(**kwargs)

    def save(self, filepath):
        self.best_transformer.save(filepath)

    def load(self, filepath):
        self.best_transformer.load(filepath)
        return self


def create_param_space(params, n_runs):
    seed = np.random.randint(1000)
    param_space = []
    for i in range(n_runs):
        set_seed(seed + i)
        param_choice = {}
        for param, value in params.items():
            if isinstance(value, list):
                if len(value) == 2:
                    mode = 'choice'
                    param_choice[param] = sample_param_space(value, mode)
                else:
                    mode = value[-1]
                    param_choice[param] = sample_param_space(value[:-1], mode)
            else:
                param_choice[param] = value
        param_space.append(param_choice)
    return param_space


def sample_param_space(value_range, mode):
    if mode == 'list':
        value = np.random.choice(value_range)
    else:
        range_min, range_max = value_range
        if mode == 'choice':
            value = np.random.choice(range(range_min, range_max, 1))
        elif mode == 'uniform':
            value = np.random.uniform(low=range_min, high=range_max)
        elif mode == 'log-uniform':
            value = np.exp(np.random.uniform(low=np.log(range_min), high=np.log(range_max)))
        else:
            raise NotImplementedError
    return value


class GridSearchCallback:
    def on_run_end(self, score, params):
        return NotImplementedError

    def on_search_end(self, results):
        return NotImplementedError


class NeptuneMonitor(GridSearchCallback):
    def __init__(self, name):
        self.name = name
        self.ctx = neptune.Context()
        self.highest_params_channel = self._create_text_channel(name='highest params')
        self.lowest_params_channel = self._create_text_channel(name='lowest params')
        self.run_params_channel = self._create_text_channel(name='run params')
        self.run_id = 0

    def on_run_end(self, score, params):
        self.ctx.channel_send('score on run', x=self.run_id, y=score)
        self.run_params_channel.send(y=params)
        self.run_id += 1

    def on_search_end(self, results):
        results_sorted = sorted(results, key=lambda x: x[0])
        highest_score, highest_param_set = results_sorted[-1]
        lowest_score, lowest_param_set = results_sorted[0]

        self.ctx.channel_send('highest score', x=0, y=highest_score)
        self.ctx.channel_send('lowest score', x=0, y=lowest_score)

        self.highest_params_channel.send(y=highest_param_set)
        self.lowest_params_channel.send(y=lowest_param_set)

    def _create_text_channel(self, name=''):
        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)


class SaveResults(GridSearchCallback):
    def __init__(self, filepath):
        self.filepath = filepath

    def on_search_end(self, results):
        joblib.dump(results, self.filepath)
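To make the parameter-space grammar concrete: a scalar value is kept fixed across runs, a two-element list [min, max] draws an integer from range(min, max), and a longer list uses its last element as the sampling mode ('list', 'choice', 'uniform', or 'log-uniform'). Below is a minimal sketch of driving RandomSearchOptimizer with a toy scikit-learn wrapper; the wrapper, data, and parameter values are hypothetical, not from this commit, and the module's own imports (numpy, steppy, utils.set_seed) are assumed to resolve:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


class LogRegTransformer(BaseTransformer):
    # Hypothetical minimal model wrapper, used only for this sketch.
    def __init__(self, **params):
        self.estimator = LogisticRegression(**params)

    def fit(self, X, y, **kwargs):
        self.estimator.fit(X, y)
        return self

    def transform(self, X, **kwargs):
        # RandomSearchOptimizer scores the first value of the returned dict.
        return {'prediction': self.estimator.predict_proba(X)[:, 1]}


params = {'C': [1e-3, 1e2, 'log-uniform'],           # float sampled on a log scale
          'max_iter': [100, 500],                    # two-element list -> integer 'choice'
          'solver': ['lbfgs', 'liblinear', 'list'],  # pick one of the listed values
          'penalty': 'l2'}                           # scalar -> fixed in every run

rng = np.random.RandomState(1234)
X_train, y_train = rng.randn(200, 5), rng.randint(0, 2, 200)
X_valid, y_valid = rng.randn(100, 5), rng.randint(0, 2, 100)

optimizer = RandomSearchOptimizer(LogRegTransformer, params,
                                  score_func=roc_auc_score, maximize=True,
                                  train_input_keys=['X', 'y'],
                                  valid_input_keys=['X_valid', 'y_valid'],
                                  n_runs=10,
                                  callbacks=[SaveResults('random_search_results.pkl')])
optimizer.fit(X=X_train, y=y_train, X_valid=X_valid, y_valid=y_valid)
y_pred = optimizer.transform(X=X_valid)['prediction']  # predictions of the best run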