# script2.R
# this is just the testing script
# get some data
library(tidyverse)
library(readxl)
library(abind)
library(forecast)
# parameters
numGen <- 5             # number of generations to run
maxCluster <- 2         # maximum number of clusters
stopNow <- 100          # stopping cap (not used in this script yet)
corThreshold <- 0.2     # correlation threshold for the panel check
numberGroupTests <- 1   # series sampled per cluster when testing models
currentGen <- 1         # generation currently being estimated
numFitSources <- 7      # accuracy measures stored per series
weakLinkPercent <- 0.1  # share of series flagged as weak links
# list of things in the metadata for each series
IC <- c("AIC", "AICc", "BIC", "HQ", "RSS")
measures <- c("MAPE", "MASE", "MdAPE", "MdASE")
features <- c("frequency", "trend", "seasonal", "autocorrelation",
"non-linear", "skewness",
"kurtosis", "Hurst", "Lyapunov",
"dc autocorrelation", "dc non-linear", "dc skewness",
"dc kurtosis")
meta <- c("clusterNumber", "rank", "gensSinceImprovement",
"modelChoice")
# full slot list, in storage order
features <- c(IC, measures, features, meta)
lastSlot <- length(features)  # index of the final slot ("modelChoice")
productivity <- read_xls("data/52600550022016.xls", sheet = "Table 4",
                         col_names = FALSE, skip = 10, n_max = 17)
# the data is sideways and has difficult variable names - let's fix that!
productivity <- t(productivity) %>%
  as.data.frame()
colnames(productivity) <- c("LabourHW",
                            "LabourQA",
                            "Capital",
                            "Multifactor.Productivity.Null",
                            "MultifactorHW",
                            "MultifactorQA")
# Getting rid of empty space and unnecessary bits (drops the Null column)
productivity <- productivity[-1, ]
productivity <- productivity[, c(1:3, 5:6)]
# A few things going on here:
# 1. Replace the ABS "na" with NA for R compatibility
# 2. Read the data in as numeric
# 3. Convert to a data frame
# 4. Reindex to the 2003-04 year (row 31 of the data frame)
# 5. Convert to a time series object
productivity <- productivity %>%
  map(function(x) replace(x, x == "na", NA)) %>%
  map(function(x) as.numeric(as.character(x))) %>%
  as.data.frame() %>%
  sapply(function(vec) vec / vec[31] * 100,
         simplify = TRUE) %>%
  ts(start = 1973)
# Drop the quality-adjusted (QA) series, keeping the hours-worked measures
productivity <- productivity[, -c(2, 5)]
y <- productivity
# Hyndman et al. 2016 - bagged ETS, kept here for reference (not run):
# fit <- baggedETS(y)
# fcast <- forecast(fit)
# plot(fcast)
## Generate metadata array: series x feature slots x generations
metaDataFeatures <- array(data = NA, dim = c(ncol(y), length(features), numGen))
## assess features (featureCalc is not defined in this script)
assessFeatures <- function(x, position){
  # position is currently unused; return the feature vector directly
  featureCalc(x)
}
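## A hedged, illustrative sketch of the kind of vector featureCalc is assumed
## to return (one value per slot in `features`, NA elsewhere) - NOT the
## project's actual implementation:
featureCalcSketch <- function(x) {
  x <- na.omit(x)
  out <- setNames(rep(NA_real_, lastSlot), features)
  out["frequency"] <- frequency(x)
  out["autocorrelation"] <- acf(x, plot = FALSE)$acf[2]  # lag-1 autocorrelation
  out["skewness"] <- mean((x - mean(x))^3) / sd(x)^3
  out["kurtosis"] <- mean((x - mean(x))^4) / sd(x)^4
  out
}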
for (i in 1:ncol(y)){
  metaDataFeatures[i, , 1] <- assessFeatures(y[, i], i)
}
## assess clusters
groups <- assessClusters(metaDataFeatures[,,1], maxCluster)
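## assessClusters is also defined outside this script. A hedged, illustrative
## sketch of the sort of thing it might do (NOT the actual implementation):
## k-means on the usable feature columns, one cluster id per series.
clusterSketch <- function(featMat, k) {
  ok <- apply(featMat, 2, function(col) !anyNA(col) && sd(col) > 0)  # drop NA/constant cols
  kmeans(scale(featMat[, ok, drop = FALSE]), centers = k)$cluster
}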
## If more than one series lands in a cluster, check the correlation matrix
# between them - on detrended/deseasonalised data!
# Do they all start and end at the same time? This is going to be an issue.
# Only check correlations between the core series.
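## A hedged, illustrative sketch of that check (assessPanels below is the
## project's actual helper): first-difference each series as a crude detrend,
## then flag pairs whose correlation exceeds the threshold.
corPairsSketch <- function(dat, threshold) {
  d <- diff(dat)                               # crude detrending via first differences
  cc <- cor(d, use = "pairwise.complete.obs")  # tolerate ragged NAs
  which(abs(cc) > threshold & upper.tri(cc), arr.ind = TRUE)  # correlated pairs
}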
fit <- array(NA, c(ncol(y), numFitSources, numGen))
bestFit <- array(10000, c(ncol(y), numFitSources))  # large sentinel so any real fit beats it
for (i in 1:max(groups)){
  index <- which(groups == i)
  if (length(index) >= 2){
    output <- assessPanels(index, y, corThreshold)
    groupPanel <- output$GP
    excessPanel <- output$EP
  }
  dataGroup <- y[, index]
  ## Next decide how to model each series.
  # Plan: sample randomly a number of series from each cluster, estimate them
  # in all the contended ways, compare the error measure (MAPE/MASE) for each,
  # and decide on the best model in each group (a hedged sketch of this step
  # appears just after this loop).
  choiceModel <- assessInitialModel(numberGroupTests, dataGroup)
  # Can we use panel techniques (plm package)? Maybe - it depends on the
  # technique. Don't worry about this right now.
  ### Estimate - need to work out the nested data frame
  output <- estimateModel(index, choiceModel, y, fit,
                          bestFit, currentGen, lastSlot, metaDataFeatures)
  metaDataFeatures <- output$MD
  bestFit <- output$BF
  fit <- output$Fi
  ## Calculate forecasting criteria -> if we have a new best estimate, keep it
  # in the best estimates; otherwise abandon it
} # end group loops
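## A hedged, illustrative sketch of the model-choice step above
## (assessInitialModel is the project's actual helper, defined elsewhere):
## fit each candidate model class to one randomly sampled series from the
## group and return the class with the lowest in-sample MAPE.
pickModelSketch <- function(dataGroup) {
  dataGroup <- as.matrix(dataGroup)  # cope with single-series groups
  s <- na.omit(dataGroup[, sample(ncol(dataGroup), 1)])
  mapes <- c(arima = accuracy(auto.arima(s))[1, "MAPE"],
             ann   = accuracy(nnetar(s))[1, "MAPE"])
  names(which.min(mapes))            # e.g. "arima" or "ann"
}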
## Assess weak links and perturb
weakLinks <- assessWeakLinks(bestFit, weakLinkPercent)
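## A hedged, illustrative sketch of assessWeakLinks (the project's helper is
## defined elsewhere): flag the weakLinkPercent of series whose stored best
## fit is worst, assuming column 4 holds the measure compared below.
weakLinkSketch <- function(bestFit, pct) {
  n <- max(1, ceiling(nrow(bestFit) * pct))                # at least one weak link
  order(abs(bestFit[, 4]), decreasing = TRUE)[seq_len(n)]  # indices of worst series
}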
for (i in weakLinks){
  ## test which model suits each weak link
  fitARIMA.pick <- accuracy(auto.arima(y[, i]))  # this should be a functional of some kind
  fitANN.pick <- accuracy(nnetar(y[, i]))
  picks <- rbind(fitARIMA.pick, fitANN.pick)     # one row of accuracy measures per model
  bestPick <- which.min(picks[, 4])
  if (picks[bestPick, 4] < bestFit[i, 4]){
    fit[i, , currentGen] <- picks[bestPick, ]
    bestFit[i, ] <- picks[bestPick, ]
  }
}
# weak links within groups
# weak links across groups
# Reassess weak links and update models, then go back to the forecasting criteria.
## Are we doing a manual reassessment? If so, how many series?
## Manual reassessments: show plots and current workings.
# Back up to clustering for the next generation. Perturb the residuals a bit??