# By Jeff Yuanbo Han (u6617017), 2018-05-30.
import numpy as np
import matplotlib.pyplot as plt
from decisionTree import *
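# NOTE: decisionTree is expected to provide vec_feature(), model(), and the
# preloaded `train` data and `headers` used throughout this file; none of
# them are defined here.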
# define GA settings
DNA_SIZE = 22         # number of bits in DNA
POP_SIZE = 50         # population size
DNA_POSITIVE = 5      # max number of positive genes
CROSS_RATE = 0.8      # DNA crossover probability
MUTATION_RATE = 0.05  # mutation probability
N_GENERATIONS = 10    # number of generations
depth = 6             # max depth of Decision Tree
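# Each DNA bit flags one candidate feature (numbered 1..DNA_SIZE), so a
# chromosome encodes a feature subset of at most DNA_POSITIVE features.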
# define objective function (accuracy of Decision Tree)
def F(featuresNo_list):
    accuracy_list = []
    for featuresNo in featuresNo_list:
        dummyX, dummyY = vec_feature(featuresNo, train, headers)
        accuracy_list.append(model(dummyX, dummyY, depth=depth)[1])
    return np.array(accuracy_list)
# define non-zero fitness function for selection
def get_fitness(prediction):
    return prediction + 1e-7 - np.min(prediction)
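# (shifting by the minimum keeps every fitness strictly positive, which the
# roulette-wheel selection in select() needs for its probability weights)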
# convert binary DNA to a meaningful list of feature selection numbers
def translateDNA(pop):
    featuresNo_list = []
    for DNA in pop:
        featuresNo_list.append([i+1 for (i, j) in enumerate(DNA) if j == 1])
    return featuresNo_list
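# e.g. a DNA starting [0, 1, 0, 1, 0, ...] translates to feature numbers [2, 4]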
# define population select function based on fitness value
# individuals with higher fitness values have a higher chance of being selected
def select(pop, fitness):
    idx = np.random.choice(np.arange(POP_SIZE), size=POP_SIZE, replace=True,
                           p=fitness/fitness.sum())
    return pop[idx]
# define gene crossover function
def crossover(parent, pop):
    if np.random.rand() < CROSS_RATE:
        # randomly select another individual from the population
        i = np.random.randint(0, POP_SIZE, size=1)
        # choose crossover points (bits); the np.bool alias is removed in
        # newer NumPy, so use the builtin bool
        cross_points = np.random.randint(0, 2, size=DNA_SIZE).astype(bool)
        # produce one child (overwrites parent in place)
        parent[cross_points] = pop[i, cross_points]
    return parent
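# (each bit is taken from the mate with probability 0.5, i.e. uniform crossover)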
# define mutation function
def mutate(child):
    for point in range(DNA_SIZE):
        if np.random.rand() < MUTATION_RATE:
            child[point] = 1 if child[point] == 0 else 0
    return child
# define function to suppress redundant positive genes
def depressDNA(child):
    if sum(child) > DNA_POSITIVE:
        ind = [i for (i, j) in enumerate(child) if j == 1]
        np.random.shuffle(ind)
        child[ind[DNA_POSITIVE:]] = 0
    return child
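# (a random subset of exactly DNA_POSITIVE ones is kept, so the encoded
# feature subset never grows beyond DNA_POSITIVE features)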
# define initializing function
def initialize(pop_size):
    pop = [1] * DNA_POSITIVE + [0] * (DNA_SIZE - DNA_POSITIVE)
    np.random.shuffle(pop)
    return np.array([pop] * pop_size)
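# (every individual starts as a copy of the same shuffled chromosome with
# exactly DNA_POSITIVE ones; mutation, and then crossover, diversify them)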
def main():
    print('----------DNA_POSITIVE = {}----------'.format(DNA_POSITIVE))
    # initialise population DNA
    pop = initialize(POP_SIZE)
    # lists for plotting to show the learning process
    x = []
    y = []
    for t in range(N_GENERATIONS):
        # convert binary DNA to feature selection numbers
        pop_DNA = translateDNA(pop)
        # compute objective function based on extracted DNA
        F_values = F(pop_DNA)
        # the best individual so far
        maximum = np.max(F_values) * 100
        best_DNA = pop[np.argmax(F_values), :]
        print('Generation {}'.format(t+1))
        print("Most fitted DNA: {}".format(best_DNA))
        print("Accuracy = {}%".format(maximum))
        # record for later plotting
        x.append(t+1)
        y.append(maximum)
        # stop early if accuracy reaches 100%
        if maximum == 100:
            break
        # train GA
        # calculate fitness values
        fitness = get_fitness(F_values)
        # select fitter individuals as parent 1
        pop = select(pop, fitness)
        # make another copy as parent 2
        pop_copy = pop.copy()
        for parent in pop:
            # produce a child by crossover
            child = crossover(parent, pop_copy)
            # mutate child
            child = mutate(child)
            # drop redundant positive genes
            child = depressDNA(child)
            # re-initialize in case all genes are 0
            if sum(child) == 0:
                child = initialize(1)[0]
            # replace parent with its child (crossover/mutate already work in
            # place, so this write-back only matters for re-initialized children)
            parent[:] = child
    # plot final Decision Tree and learning process
    print('----------Final Decision Tree----------')
    dummyX, dummyY, feature_names = vec_feature(translateDNA([best_DNA])[0], train, headers, get_feature_names=True)
    print('Feature List:')
    print(feature_names)
    model(dummyX, dummyY, depth=depth, vis=True)
    plt.scatter(x, y, s=100, lw=0, c='red', alpha=0.5)
    plt.plot(x, y)
    plt.title('{} Positive Genes'.format(DNA_POSITIVE))
    plt.xlabel('Generation')
    plt.ylabel('Accuracy (%)')
    plt.xticks(range(1, t+2))
    plt.yticks([80+i*2 for i in range(11)])
    plt.show()
if __name__ == '__main__':
    main()