-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathfeature_selection_bidirectional.py
58 lines (43 loc) · 1.56 KB
/
feature_selection_bidirectional.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""Bidirectional Feature Selection using an sklearn estimator."""
"""
Settings for this recipe:
TARGET_COLUMN: Column name of target variable
ESTIMATOR: Base sklearn estimator
K_FEATURES: Number of final features to select
SCORING: Scoring metric
CV: Number of cross-validation folds
More details available here: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector
P.S. Categorical inputs need to be converted to numeric before running feature selection.
"""
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.data import CustomData
import typing
from sklearn.linear_model import LogisticRegression
# User-editable settings for this recipe.
# Please edit these before usage (default values are for credit card dataset)
TARGET_COLUMN = 'default payment next month'  # target column; split off from the features before selection
ESTIMATOR = LogisticRegression()  # base sklearn estimator handed to the feature selector
K_FEATURES = 10  # number of final features to select
SCORING = 'accuracy'  # scoring metric handed to the feature selector
CV = 5  # number of cross-validation folds
class BidirectionalFeatureSelection(CustomData):
    """Data recipe that reduces a dataset to the features chosen by mlxtend.

    Selection is done with mlxtend's ``SequentialFeatureSelector`` in forward
    mode with floating enabled, configured through the module-level settings
    ``TARGET_COLUMN``, ``ESTIMATOR``, ``K_FEATURES``, ``SCORING`` and ``CV``.
    """

    # Third-party package this recipe imports at run time.
    _modules_needed_by_name = ["mlxtend"]

    @staticmethod
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        """Return the selected feature columns of ``X``.

        Args:
            X: Input frame containing the feature columns and the column
                named by ``TARGET_COLUMN``. Feature columns are expected to
                be numeric (see the module notes).

        Returns:
            A pandas DataFrame holding only the selected feature columns; the
            target column is not included. When ``X`` is ``None`` an empty
            list is returned instead.

        Raises:
            KeyError: If ``TARGET_COLUMN`` is not a column of ``X``.
        """
        if X is None:
            # NOTE(review): an empty list (not a DataFrame) is the original
            # "no input" result; presumably the host's convention - confirm.
            return []

        # Imported here rather than at module level, as in the original;
        # presumably so the module loads before mlxtend is available - confirm.
        from mlxtend.feature_selection import SequentialFeatureSelector as SFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        # mlxtend rejects an integer k_features larger than the number of
        # available columns, so a small dataset would crash with the default
        # K_FEATURES. Clamp integer settings to the column count; tuple or
        # string settings (e.g. a range or "best") are passed through as-is.
        k_features = K_FEATURES
        if isinstance(k_features, int) and not isinstance(k_features, bool):
            k_features = min(k_features, X.shape[1])

        sfs = SFS(
            ESTIMATOR,
            k_features=k_features,
            forward=True,
            floating=True,
            scoring=SCORING,
            cv=CV,
            n_jobs=-1,
        )
        sfs.fit(X, y)

        # k_feature_idx_ holds the positional indices of the chosen columns.
        X_fs = X.iloc[:, list(sfs.k_feature_idx_)]
        return X_fs