-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_forest.py
90 lines (67 loc) · 3.77 KB
/
random_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
## Random Forest Classifier Using Scikit-Learn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import time
# Load dataset
data = pd.read_csv('song_data.csv')  # expects a 'genre' column holding the class label

# Features and target
x = data.drop(columns=['genre'])
x = x.apply(pd.to_numeric, errors='coerce')  # non-numeric cells become NaN
# A column that is entirely NaN after coercion has no mean to impute with,
# so it would stay NaN; it carries no information, so drop it.
x = x.dropna(axis=1, how='all')
y = data['genre'].values

# Split BEFORE imputing/scaling so that no statistic of the test rows
# (column means, standard deviations) leaks into the training features.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=50
)

# Fill missing values with the TRAINING column means in both partitions.
train_means = x_train.mean()
x_train = x_train.fillna(train_means)
x_test = x_test.fillna(train_means)

# Standardize features: fit the scaler on the training rows only, then apply
# the same transformation to the held-out rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Hyperparameter candidates passed to both GridSearchCV and RandomizedSearchCV
# below. Each key is a RandomForestClassifier constructor argument and each
# value is the list of settings to try for it.
param_grid = dict(
    n_estimators=[10, 25, 50, 100, 200],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9, None],
    max_leaf_nodes=[3, 6, 9, None],
)
# Train Random Forest with default hyperparameters (baseline, no tuning)
model = RandomForestClassifier()
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)  # without parameter optimization
# Stop the clock only after predict() so the figure really is
# "training + testing", matching the label and the tuned-model sections below.
end_time = time.perf_counter()
print(f"Training + testing runtime without parameter optimization: {end_time - start_time:.2f}s")
# Performance evaluation metrics. classification_report takes (y_true, y_pred)
# in that order; passing them the other way round swaps precision and recall.
# NOTE(review): an earlier note here recorded ~87% average precision, but that
# figure was produced with the arguments swapped -- re-measure before quoting it.
print(classification_report(y_test, y_pred))
# Exhaustive grid search over param_grid with 5-fold cross validation
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
start_time = time.perf_counter()  # interval clock, unaffected by wall-clock changes
grid_search.fit(x_train, y_train)
end_time = time.perf_counter()
print(grid_search.best_estimator_)  # best estimator found by the search
print(f"Time taken for grid search: {end_time - start_time:.2f}s")  # time taken for grid search

# Train a fresh forest with the winning hyperparameters so that its own
# training + prediction time can be measured separately from the search.
# Unpacking best_params_ keeps this in sync with whatever keys param_grid has.
model_grid = RandomForestClassifier(**grid_search.best_params_)
start_time = time.perf_counter()
model_grid.fit(x_train, y_train)
y_pred_grid = model_grid.predict(x_test)
end_time = time.perf_counter()
# classification_report takes (y_true, y_pred); the reverse order swaps
# precision and recall in the printed table.
print(classification_report(y_test, y_pred_grid))  # model performance with grid search
print(f"Training + testing runtime: {end_time - start_time:.2f} seconds")  # training + prediction runtime
# Randomized search with 5-fold cross validation: samples a subset of the
# combinations in param_grid instead of trying them all.
random_search = RandomizedSearchCV(
    RandomForestClassifier(), param_distributions=param_grid, cv=5
)
start_time = time.perf_counter()  # interval clock, unaffected by wall-clock changes
random_search.fit(x_train, y_train)
end_time = time.perf_counter()
print(random_search.best_estimator_)  # best estimator found by the search
print(f"Time taken for random search: {end_time - start_time:.2f}s")  # time taken for random search

# Train a fresh forest with the hyperparameters picked by the randomized
# search and time its training + prediction.
# NOTE: model_grid / y_pred_grid are the same names the grid-search section
# uses; they are overwritten here and from now on refer to the random-search
# model and its predictions.
model_grid = RandomForestClassifier(**random_search.best_params_)
start_time = time.perf_counter()
model_grid.fit(x_train, y_train)
y_pred_grid = model_grid.predict(x_test)
end_time = time.perf_counter()
# classification_report takes (y_true, y_pred); the reverse order swaps
# precision and recall in the printed table.
print(classification_report(y_test, y_pred_grid))  # model performance with random search
print(f"Training + testing runtime: {end_time - start_time:.2f} seconds")  # training + prediction runtime