Commit 1a3eac8

first commit

PENGZhaoqing committed Nov 29, 2016
0 parents  commit 1a3eac8
Showing 31 changed files with 1,138,632 additions and 0 deletions.
30 changes: 30 additions & 0 deletions CART/CART_Predictor.py
@@ -0,0 +1,30 @@
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import collections

class Predictor:
    def predict(self, data_set, data_target, mode):
        clf = joblib.load('output/CART.pkl')
        trained_target = clf.predict(data_set)

        if mode == "validation":
            print "cross validation test_target distribution: " + str(collections.Counter(data_target))
        elif mode == "test":
            print "true test_target distribution: " + str(collections.Counter(data_target))

        print "\n"
        print confusion_matrix(data_target, trained_target, labels=[0, 1, 2, 3, 4])
        print "\n"
        print classification_report(data_target, trained_target)
        print "\n"
        # Unfinished in this commit: append per-class recall/precision to the
        # confusion matrix ("matrix" is never defined above, and range(0, 4)
        # skips class 4). A working sketch of the idea follows below.
        # for row in range(0, 4):
        #     if sum(matrix[row, :][0:-1]) > 0:
        #         matrix[row][5] = matrix[row][row] / sum(matrix[row, :][0:-1])
        #
        # for col in range(0, 4):
        #     if sum(matrix[:, col][0:-1]) > 0:
        #         matrix[5][col] = matrix[col][col] / sum(matrix[:, col][0:-1])
        #
        # print np.around(matrix, 3)
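A minimal working version of the commented-out idea above, assuming a five-class problem and a 6x6 padded matrix; the function name recall_precision_matrix is illustrative, not part of this commit:

import numpy as np
from sklearn.metrics import confusion_matrix

def recall_precision_matrix(y_true, y_pred, n_classes=5):
    # Pad the confusion matrix with one extra row/column holding
    # per-class precision (bottom row) and recall (right column).
    cm = confusion_matrix(y_true, y_pred, labels=range(n_classes)).astype(float)
    matrix = np.zeros((n_classes + 1, n_classes + 1))
    matrix[:n_classes, :n_classes] = cm
    for k in range(n_classes):
        if cm[k, :].sum() > 0:
            matrix[k, n_classes] = cm[k, k] / cm[k, :].sum()  # recall of class k
        if cm[:, k].sum() > 0:
            matrix[n_classes, k] = cm[k, k] / cm[:, k].sum()  # precision of class k
    return np.around(matrix, 3)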
Binary file added CART/CART_Predictor.pyc
33 changes: 33 additions & 0 deletions CART/CART_Runner.py
@@ -0,0 +1,33 @@
from Mongo_Con import DB_manager
from CART_Trainer import Trainer
from CART_Predictor import Predictor
from Variable import attr_list


class Runner:
    trainer = Trainer()
    predictor = Predictor()

    def train_and_validate(self, dataset, datatarget, T_len):
        dataset = self.trainer.label_encoding(dataset)
        # feature_set, fea_index = self.trainer.feature_selection(data_set, self.db.attr_list)
        data_set = dataset[0:T_len]        # first T_len rows form the training portion
        data_target = datatarget[0:T_len]
        feature_set, fea_index = self.trainer.tree_based_selection(data_set, data_target, attr_list)
        training_set, training_target, test_set, test_target = self.trainer.cross_validation_filter(feature_set,
                                                                                                    data_target)
        self.trainer.train(training_set, training_target, fea_index)
        self.predictor.predict(test_set, test_target, mode="validation")
        return dataset, datatarget, fea_index

    def predict(self, data_set, data_target, fea_index, T_len):
        test_data_set = data_set[T_len:]        # rows from T_len on are the held-out test portion
        test_data_target = data_target[T_len:]
        feature_set = test_data_set[:, fea_index]
        self.predictor.predict(feature_set, test_data_target, mode="test")


runner = Runner()
dataset, datatarget, T_len = DB_manager().CART_fetch_data()
dataset, datatarget, fea_index = runner.train_and_validate(dataset, datatarget, T_len)
runner.predict(dataset, datatarget, fea_index, T_len)
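A small sketch (not part of this commit) of persisting fea_index next to the trained model, so a later prediction run can rebuild the selected columns without retraining; the output path is illustrative:

from sklearn.externals import joblib

joblib.dump(fea_index, 'output/fea_index.pkl')   # right after train_and_validate
fea_index = joblib.load('output/fea_index.pkl')  # before calling predict in a later run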
130 changes: 130 additions & 0 deletions CART/CART_Trainer.py
@@ -0,0 +1,130 @@
from sklearn.externals import joblib
import random
from sklearn import tree
from sklearn import preprocessing
import numpy as np
import pydotplus
from Mongo_Con import DB_manager
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from Variable import attr_list


class Trainer:
    db = DB_manager.client

    def train(self, training_set, training_target, fea_index):

        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=5)
        clf = clf.fit(training_set, training_target)

        class_names = np.unique([str(i) for i in training_target])
        feature_names = [attr_list[i] for i in fea_index]

        dot_data = tree.export_graphviz(clf, out_file=None,
                                        feature_names=feature_names,
                                        class_names=class_names,
                                        filled=True, rounded=True,
                                        special_characters=True)

        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("output/tree-vis.pdf")
        joblib.dump(clf, 'output/CART.pkl')

    def feature_selection(self, data_set, feature_names):
        """Drop features whose variance falls below the threshold.

        :param data_set: 2-D feature array
        :param feature_names: column names aligned with data_set
        :return: feature_set, fea_index (original indices of the kept columns)
        """
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        feature_set = sel.fit_transform(data_set)

        # Recover each kept column's original index by matching columns
        # between the input array and the reduced array.
        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print check

        return feature_set, fea_index

    def tree_based_selection(self, data_set, data_target, feature_names):
        """Keep the features an ExtraTreesClassifier ranks as important.

        :param data_set: 2-D feature array
        :param data_target: target labels
        :param feature_names: column names aligned with data_set
        :return: feature_set, fea_index (original indices of the kept columns)
        """
        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print clf.feature_importances_

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        # Same column-matching trick as in feature_selection above.
        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print check

        return feature_set, fea_index

    def label_encoding(self, dataset):
        """Integer-encode the three categorical columns (1, 2 and 3).

        :param dataset: 2-D array with categorical values in columns 1-3
        :return: dataset with those columns label-encoded in place
        """
        le_1 = preprocessing.LabelEncoder()
        le_2 = preprocessing.LabelEncoder()
        le_3 = preprocessing.LabelEncoder()

        le_1.fit(np.unique(dataset[:, 1]))
        le_2.fit(np.unique(dataset[:, 2]))
        le_3.fit(np.unique(dataset[:, 3]))

        dataset[:, 1] = le_1.transform(dataset[:, 1])
        dataset[:, 2] = le_2.transform(dataset[:, 2])
        dataset[:, 3] = le_3.transform(dataset[:, 3])

        return dataset

    def cross_validation_filter(self, data_set, data_target, factor=0.1):
        """Randomly hold out a fraction of the rows as a validation split.

        :param data_set: 2-D feature array
        :param data_target: target labels
        :param factor: fraction of rows to hold out
        :return: training_set, training_target, test_set, test_target
        """
        test_index = random.sample(range(0, len(data_target)), int(len(data_target) * factor))
        training_index = list(set(range(0, len(data_target))) - set(test_index))

        training_set = data_set[training_index]
        training_target = data_target[training_index]

        test_set = data_set[test_index]
        test_target = data_target[test_index]

        print "\n"
        print "training_set: " + str(training_set.shape)
        print "training_target: " + str(training_target.shape)

        print "test_set: " + str(test_set.shape)
        print "test_target: " + str(test_target.shape)
        print "\n"

        return training_set, training_target, test_set, test_target
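The nested column-matching loops above can report duplicate indices when two columns happen to hold identical values. sklearn's selectors already track which columns they kept, via get_support; a minimal equivalent sketch, reusing the same objects as above:

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
feature_set = sel.fit_transform(data_set)
fea_index = sel.get_support(indices=True)  # original indices of the kept columns

model = SelectFromModel(clf, prefit=True)
feature_set = model.transform(data_set)
fea_index = model.get_support(indices=True)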
Binary file added CART/CART_Trainer.pyc
Empty file added CART/__init__.py
Binary file added CART/output/CART.pkl
Binary file added CART/output/tree-vis.pdf
13 changes: 13 additions & 0 deletions MLP/MLP_Predictor.py
@@ -0,0 +1,13 @@
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

class Predictor:
    def predict(self, test_set, test_target):
        clf = joblib.load('output/MLP.pkl')
        trained_target = clf.predict(test_set)

        print confusion_matrix(test_target, trained_target, labels=[0, 1, 2, 3, 4])
        print classification_report(test_target, trained_target)
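A quick smoke test for the class above; the data is purely illustrative and assumes an output/MLP.pkl trained on 20 PCA components already exists:

import numpy as np
from MLP_Predictor import Predictor

X = np.random.rand(8, 20)            # fake test rows, 20 features each
y = np.random.randint(0, 5, size=8)  # fake labels in classes 0-4
Predictor().predict(X, y)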


Binary file added MLP/MLP_Predictor.pyc
21 changes: 21 additions & 0 deletions MLP/MLP_Runner.py
@@ -0,0 +1,21 @@
from Mongo_Con import DB_manager
from MLP_Trainer import Trainer
from MLP_Predictor import Predictor


class MLP_Runner:
    trainer = Trainer()
    predictor = Predictor()

    def data_load(self):
        dataset, datatarget, T_len = DB_manager().MLP_fetch_data()
        return dataset, datatarget, T_len

    def train(self, dataset, datatarget, T_len):
        data_set, data_target, test_set, test_target = self.trainer.one_hot_encoding(dataset, datatarget, T_len)
        self.trainer.train(data_set, data_target)
        self.predictor.predict(test_set, test_target)


runner = MLP_Runner()
dataset, datatarget, T_len = DB_manager().MLP_fetch_data()
runner.train(dataset, datatarget, T_len)
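One caveat in the flow above: one_hot_encoding fits the DictVectorizer on all rows, test rows included, so the test portion influences the encoding vocabulary. A sketch of fitting on the training rows only (variable names hypothetical):

vec = DictVectorizer()
train_enc = vec.fit_transform(dataset[:T_len]).toarray()  # learn the vocabulary on training rows
test_enc = vec.transform(dataset[T_len:]).toarray()       # reuse it; unseen test categories are dropped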
111 changes: 111 additions & 0 deletions MLP/MLP_Trainer.py
@@ -0,0 +1,111 @@
from sklearn.externals import joblib
import random
from sklearn.feature_extraction import DictVectorizer
import collections
import numpy as np
from Mongo_Con import DB_manager
from sklearn.feature_selection import VarianceThreshold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from Variable import attr_list


class Trainer:
    db = DB_manager.client

    def train(self, training_set, training_target):
        clf = MLPClassifier(solver='adam', alpha=1e-5,
                            hidden_layer_sizes=(10, 6), random_state=1)
        clf.fit(training_set, training_target)
        joblib.dump(clf, 'output/MLP.pkl')

    def feature_selection(self, data_set):
        """Drop features whose variance falls below the threshold.

        :param data_set: 2-D feature array
        :return: feature_set, fea_index (original indices of the kept columns)
        """
        sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
        feature_set = sel.fit_transform(data_set)

        # Recover each kept column's original index by matching columns
        # between the input array and the reduced array.
        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[attr_list[i]] = data_set[0][i]
        print check

        return feature_set, fea_index

    def one_hot_encoding(self, dataset, datatarget, T_len):
        """One-hot encode the raw rows, then project and scale them.

        :param dataset: list of feature dicts
        :param datatarget: target labels
        :param T_len: boundary between training rows and test rows
        :return: data_set, data_target, test_set, test_target
        """
        vec = DictVectorizer()
        dataset = vec.fit_transform(dataset).toarray()

        print dataset.shape

        data_set = dataset[0:T_len]
        data_target = datatarget[0:T_len]

        test_set = dataset[T_len:len(dataset)]
        test_target = datatarget[T_len:len(dataset)]

        # PCA and the scaler are fitted on the training portion only,
        # then applied unchanged to the test portion.
        pca = PCA(n_components=20)
        pca.fit(data_set)

        print data_set.shape

        data_set = pca.transform(data_set)
        test_set = pca.transform(test_set)

        scaler = StandardScaler()
        scaler.fit(data_set)

        print(pca.explained_variance_ratio_)

        data_set = scaler.transform(data_set)
        test_set = scaler.transform(test_set)

        print collections.Counter(test_target)

        return data_set, data_target, test_set, test_target

    def cross_validation_filter(self, data_set, data_target, factor=0.1):
        """Randomly hold out a fraction of the rows as a validation split.

        :param data_set: 2-D feature array
        :param data_target: target labels
        :param factor: fraction of rows to hold out
        :return: training_set, training_target, test_set, test_target
        """
        test_index = random.sample(range(0, len(data_target)), int(len(data_target) * factor))
        training_index = list(set(range(0, len(data_target))) - set(test_index))

        training_set = data_set[training_index]
        training_target = data_target[training_index]

        test_set = data_set[test_index]
        test_target = data_target[test_index]

        print "training_set: " + str(training_set.shape)
        print "training_target: " + str(training_target.shape)

        print "test_set: " + str(test_set.shape)
        print "test_target: " + str(test_target.shape)

        counter = collections.Counter(training_target)
        print counter

        return training_set, training_target, test_set, test_target
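The PCA, scaler and classifier above are fitted separately but must always be replayed together at prediction time; a sketch of the same chain as a single sklearn Pipeline (the output path is illustrative):

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('pca', PCA(n_components=20)),
    ('scale', StandardScaler()),
    ('mlp', MLPClassifier(solver='adam', alpha=1e-5,
                          hidden_layer_sizes=(10, 6), random_state=1)),
])
pipe.fit(data_set, data_target)
joblib.dump(pipe, 'output/MLP_pipeline.pkl')  # one artifact instead of three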
Binary file added MLP/MLP_Trainer.pyc
Empty file added MLP/__init__.py
Binary file added MLP/output/MLP.pkl
Binary file added MLP/output/decision-tree.pkl