-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_model.py
executable file
·51 lines (40 loc) · 1.51 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
# Load the data set
df = pd.read_csv("ml_house_data_set_updated.csv")
# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']
# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])
# Remove the sale price from the feature data
del features_df['sale_price']
# Create the X and y arrays
X = features_df.as_matrix()
y = df['sale_price'].as_matrix()
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit regression model
model = ensemble.GradientBoostingRegressor(
n_estimators=1000,
learning_rate=0.1,
max_depth=6,
min_samples_leaf=9,
max_features=0.1,
loss='huber',
random_state=0
)
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')
# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)
# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)