# Insurance_data_analysis.R
# Install packages we need
#install.packages("readr") #Read Rectangular Text Data
#install.packages("dplyr") #A Grammar of Data Manipulation
#install.packages("rpart")
#install.packages("rpart.plot")
#install.packages("partykit")
#install.packages("arules")
#install.packages("arulesViz")
# Load the installed packages
library(readr)
library(dplyr)
library(rpart)
library(rpart.plot)
library(partykit)
library(arules)
library(arulesViz)
# Read data
insurance <- read_csv("C:/Users/ian19_000/Downloads/01.2011_Census_Microdata.csv")
#str(insurance)
# Original column names of the data
#[1] "Person ID" "Region" "Residence Type" "Family Composition"
#[5] "Population Base" "Sex" "Age" "Marital Status"
#[9] "Student" "Country of Birth" "Health" "Ethnic Group"
#[13] "Religion" "Economic Activity" "Occupation" "Industry"
#[17] "Hours worked per week" "Approximated Social Grade"
# Rename the variables
names = c("PersonID", "Region", "ResidenceType", "FamilyComposition", "PopulationBase", "Sex",
"Age", "MaritalStatus", "Student", "CountryOfBirth","Health", "EthnicGroup",
"Religion", "EconomicActivity", "Occupation", "Industry", "HoursWorked", "SocialGrade")
colnames(insurance) = names
# Discard Health = -9
insurance <- insurance %>% filter(Health != -9)
#str(insurance)
# Convert the remaining variables to factors
insurance <- insurance %>% select(-PersonID, -ResidenceType, -PopulationBase) %>%
  mutate(across(everything(), as.factor))
#str(insurance)
### Decision Tree with loss matrix =======================================================
# Define loss matrix
lossmatrix_first = matrix(c( 0, 1, 1, 1, 1,
2, 0, 2, 2, 2,
6, 6, 0, 6, 6,
24, 24, 24, 0, 24,
120,120,120,120, 0), byrow = T, nrow = 5)
lossmatrix_second = matrix(c( 0, 1, 2, 6,24,
1, 0, 1, 2, 6,
2, 1, 0, 1, 2,
6, 2, 1, 0, 1,
24, 6, 2, 1, 0), byrow = T, nrow = 5)
lossmatrix = lossmatrix_first * t(lossmatrix_second)
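# For reference, the element-wise product above works out to the asymmetric cost matrix
# below (rpart treats loss[i, j] as the cost of predicting class j for a true class i), so
# classifying someone in the worst health band (5) as healthy (1) costs 2880, while the
# reverse mistake costs only 24:
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    0    1    2    6   24
# [2,]    2    0    2    4   12
# [3,]   12    6    0    6   12
# [4,]  144   48   24    0   24
# [5,] 2880  720  240  120    0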
# Split the data into training and testing sets
set.seed(1234)
idx_fold <- sample(1:5, nrow(insurance), replace = T)
idc_train <- idx_fold != 5
# Use loss matrix to construct Decision Tree model
rpart_fit <- rpart(Health ~ ., data = insurance, subset = idc_train, method = "class",
parms = list(loss = lossmatrix), cp = -1)
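# Note: cp = -1 switches off the complexity-parameter stopping rule, so rpart grows the
# largest tree allowed by its other control settings; it is pruned back with the
# cross-validated cp further below.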
# Print the fitted decision tree
#sink("Insurance_Tree_model_full.log")
#rpart_fit
#sink()
# Plot model (method 1)
#prp(rpart_fit, faclen = 0, fallen.leaves = T, extra = 2)
# Plot model (method 2)
#party_fit <- as.party(rpart_fit)
#plot(party_fit)
# Find the predicted class
pred_train <- predict(rpart_fit, insurance[idc_train, ], type = "class")
pred_test <- predict(rpart_fit, insurance[!idc_train, ], type = "class")
# Cross-tabulate actual vs. predicted classes
table_train = table(real = insurance[idc_train, ]$Health, predict = pred_train)
table_train
# predict
#real 1 2 3 4 5
#1 74480 86745 40252 8218 2294
#2 15385 79885 34922 17068 5994
#3 973 10622 26458 14624 6995
#4 19 292 3258 12751 3438
#5 0 12 131 1045 4538
table_test = table(real = insurance[!idc_train, ]$Health, predict = pred_test)
table_test
# predict
#real 1 2 3 4 5
# 1 15606 23662 10529 2485 700
# 2 6342 16355 9663 4539 1591
# 3 868 3535 4399 4062 1944
# 4 119 594 1160 1838 1089
# 5 41 125 294 592 406
# Calculate the classification error
error_train = 1 - sum(diag(table_train)) / sum(table_train)
error_train
#[1] 0.5601411
error_test = 1 - sum(diag(table_test)) / sum(table_test)
error_test
#[1] 0.6569692
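# Plain classification error ignores the asymmetric costs the tree was trained with. A small
# sketch of the average misclassification cost on the test set, reusing the objects above
# (avg_cost_test is just an illustrative name; the 5x5 loss matrix and confusion table line
# up because all five Health levels occur in both margins here):
#avg_cost_test <- sum(lossmatrix * table_test) / sum(table_test)
#avg_cost_test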
# Find the best CP under 10-fold CV
cp_matrix = printcp(rpart_fit)
cp_matrix = cp_matrix[c(-1,-2),]
cp_best = cp_matrix[which.min(cp_matrix[,"xerror"]), "CP"]
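# (optional) visualise the cross-validated error across cp values before settling on one
#plotcp(rpart_fit)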
# Prune the tree with the best CP
prune_fit <- prune(rpart_fit, cp = cp_best)
# Print the pruned tree
#sink("Insurance_Tree_model_prune.log")
#prune_fit
#sink()
# Show the variable importance of the pruned tree
prune_fit$variable.importance
# EconomicActivity Age MaritalStatus Occupation Industry SocialGrade
# 55125.1060 37957.9889 11544.6842 3325.5436 3245.3986 2575.8693
#FamilyComposition Religion Region HoursWorked EthnicGroup Student
# 1853.0231 1777.5423 1757.8407 1338.4296 650.0216 255.0316
# Sex CountryOfBirth
# 235.3054 197.4269
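# The raw importance scores are easier to compare as shares of the total; an optional sketch:
#round(prune_fit$variable.importance / sum(prune_fit$variable.importance), 3)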
# Find the predicted class
pred_prune_test <- predict(prune_fit, insurance[!idc_train, ], type = "class")
# Cross-tabulate actual vs. predicted classes for the pruned tree
table_prune_test = table(real = insurance[!idc_train, ]$Health, predict = pred_prune_test)
table_prune_test
# predict
#real 1 2 3 4 5
# 1 415 30853 19306 2307 101
# 2 84 14535 17011 6442 418
# 3 10 1949 5211 6529 1109
# 4 2 251 936 2529 1082
# 5 0 59 198 786 415
# Calculate the classification error
error_prune_test = 1 - sum(diag(table_prune_test)) / sum(table_prune_test)
error_prune_test
#[1] 0.7946916
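# For a fairer comparison with the unpruned tree, the same cost-based summary can be computed
# for the pruned model (a sketch mirroring the illustrative avg_cost_test above):
#avg_cost_prune_test <- sum(lossmatrix * table_prune_test) / sum(table_prune_test)
#avg_cost_prune_test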
# Print the evaluation results
#sink("Insurance_Tree_result.log")
#lossmatrix
#table_test;table_prune_test
#error_test;error_prune_test
#prune_fit$variable.importance
#sink()
# Save the workspace, including the fitted tree models
#save.image(file = "Insurance_Tree.RData")
### Association Rules ================================================================
# Coerce the factor data frame into transactions (each factor level becomes a binary item)
insurance_trans <- as(insurance, "transactions")
# Summarise the transactions and the item frequencies
summary(insurance_trans)
sort(itemFrequency(insurance_trans), decreasing = T)
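# (optional) plot the most frequent items; itemFrequencyPlot() is provided by arules
#itemFrequencyPlot(insurance_trans, topN = 20)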
# Mine rules with support >= 100/562937 (i.e. roughly 100 transactions) and confidence >= 0.05
rules <- apriori(insurance_trans, parameter = list(maxlen = 5,
support = 100/562937,
confidence = 0.05))
#summary(rules)
#plot(rules, measure = c("confidence", "lift"), shading = "support")
#plot(rules, method = "grouped", control = list(gp_labels = gpar(cex = 0.3)))
# Keep the rules whose rhs contains "Health=5" and whose lift is greater than 1
rulesOwn <- subset(rules, subset = rhs %pin% "Health=5" & lift > 1)
#summary(rulesOwn)
# Inspect the 10 rules with the largest lift
rulesOwn_sort = sort(rulesOwn, by = "lift")
inspect(rulesOwn_sort[1:10])
# lhs rhs
#[1] {Sex=1,MaritalStatus=2,EconomicActivity=8,SocialGrade=3} => {Health=5}
#[2] {Age=7,EconomicActivity=8} => {Health=5}
#[3] {Age=7,EconomicActivity=8,HoursWorked=-9} => {Health=5}
#[4] {Age=7,Student=2,EconomicActivity=8} => {Health=5}
#[5] {Age=7,Student=2,EconomicActivity=8,HoursWorked=-9} => {Health=5}
#[6] {Age=7,EthnicGroup=1,EconomicActivity=8} => {Health=5}
#[7] {Age=7,EthnicGroup=1,EconomicActivity=8,HoursWorked=-9} => {Health=5}
#[8] {Age=7,Student=2,EthnicGroup=1,EconomicActivity=8} => {Health=5}
#[9] {Age=7,CountryOfBirth=1,EthnicGroup=1,EconomicActivity=8} => {Health=5}
#[10] {Age=7,CountryOfBirth=1,EconomicActivity=8} => {Health=5}
# support confidence lift count
#[1] 0.0001794162 0.2121849 16.62677 101
#[2] 0.0002273789 0.2067851 16.20365 128
#[3] 0.0002273789 0.2067851 16.20365 128
#[4] 0.0002273789 0.2067851 16.20365 128
#[5] 0.0002273789 0.2067851 16.20365 128
#[6] 0.0001989565 0.2036364 15.95691 112
#[7] 0.0001989565 0.2036364 15.95691 112
#[8] 0.0001989565 0.2036364 15.95691 112
#[9] 0.0001811926 0.2000000 15.67197 102
#[10] 0.0001811926 0.1976744 15.48973 102
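# Several of the top rules only add items that leave support and confidence unchanged; an
# optional sketch that drops such redundant rules before inspection (is.redundant() is part
# of arules):
#rulesOwn_pruned <- rulesOwn_sort[!is.redundant(rulesOwn_sort)]
#inspect(head(rulesOwn_pruned, 10))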
# Write out the full set of selected rules
#sink("Insurance_ARules_rules.log")
#inspect(sort(rulesOwn, by = "lift"))
#sink()
# Save the workspace, including the mined rules
# save.image(file = "Insurance_ARules.RData")