-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgithub export NASDAQ 100 Forecaste.Rmd
161 lines (116 loc) · 5.63 KB
/
github export NASDAQ 100 Forecaste.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
---
title: "Forecasting Volume of NASDAQ 100 Stocks Traded"
author: "Nipunjeet Gujral"
---
```{r libraries}
library(ggplot2)
library(reshape2)
library(PerformanceAnalytics)
library(quantmod)
library(xgboost)
library(xts)
library(pROC)
```
```{r sourcing data}
# Ticker symbols whose price/volume history feeds the model.
# NOTE(review): confirm every ticker is still served by the data source; the
# merge chunk below needs one downloaded object per symbol.
nasdaq100_symbols <- c(
  "AAPL", "ADBE", "ADI", "ADSK", "AKAM", "ALTR", "ALXN",
  "AMAT", "AMGN", "AMZN", "BBBY", "BIDU", "BIIB",
  "BRCM", "CA", "CELG", "CERN", "CHKP", "CHRW", "CHTR", "CMCSA",
  "COST", "CSCO", "CTRX", "CTXS", "DISCA", "DISCK", "DISH",
  "DLTR", "DTV", "EBAY", "EQIX", "ESRX", "EXPD", "EXPE", "FAST",
  "FB", "FFIV", "FISV", "FOXA", "GILD", "GMCR", "GOOG", "GOOGL",
  "GRMN", "HSIC", "ILMN", "INTC", "INTU", "ISRG", "KLAC", "KRFT",
  "LBTYA", "LLTC", "LMCA", "LMCK", "LVNTA", "MAR", "MAT", "MDLZ",
  "MNST", "MSFT", "MU", "MXIM", "MYL", "NFLX", "NTAP", "NVDA",
  "NXPI", "ORLY", "PAYX", "PCAR", "PCLN", "QCOM", "QVCA", "REGN",
  "ROST", "SBAC", "SBUX", "SIAL", "SIRI", "SNDK", "SPLS", "SRCL",
  "STX", "SYMC", "TRIP", "TSCO", "TSLA", "TXN", "VIAB", "VIP",
  "VOD", "VRSK", "VRTX", "WDC", "WFM", "WYNN", "XLNX", "YHOO"
)

# Download each series. The result is not captured here: the merge chunk
# refers to one object per ticker (AAPL, ADBE, ...), so this relies on
# getSymbols() assigning each series to a variable named after its symbol.
# The previous ' CHTR' entry (leading space) could never produce `CHTR`.
getSymbols(nasdaq100_symbols)
```
```{r creating dataframe}
# Join every downloaded series into one wide table and convert it to a plain
# data frame. Later chunks parse the row names as dates.
# Every name here must match a symbol requested from getSymbols(): the former
# `SCO`, `LBTY` and `STX. SYMC` entries did not (the last was a parse error).
nasdaq100 <- data.frame(as.xts(merge(
  AAPL, ADBE, ADI, ADSK, AKAM, ALTR, ALXN,
  AMAT, AMGN, AMZN, BBBY, BIDU, BIIB,
  BRCM, CA, CELG, CERN, CHKP, CHRW, CHTR, CMCSA,
  COST, CSCO, CTRX, CTXS, DISCA, DISCK, DISH,
  DLTR, DTV, EBAY, EQIX, ESRX, EXPD, EXPE, FAST,
  FB, FFIV, FISV, FOXA, GILD, GMCR, GOOG, GOOGL,
  GRMN, HSIC, ILMN, INTC, INTU, ISRG, KLAC, KRFT,
  LBTYA, LLTC, LMCA, LMCK, LVNTA, MAR, MAT, MDLZ,
  MNST, MSFT, MU, MXIM, MYL, NFLX, NTAP, NVDA,
  NXPI, ORLY, PAYX, PCAR, PCLN, QCOM, QVCA, REGN,
  ROST, SBAC, SBUX, SIAL, SIRI, SNDK, SPLS, SRCL,
  STX, SYMC, TRIP, TSCO, TSLA, TXN, VIAB, VIP,
  VOD, VRSK, VRTX, WDC, WFM, WYNN, XLNX, YHOO
)))
```
```{r set outcome variables}
# Column whose next-day direction the model predicts.
outcomeSymbol <- "FISV.Volume"

# Build the binary label and put the table in the shape later chunks expect.
# The work happens inside local() so no temporary objects leak into the
# session; only `nasdaq100` is replaced.
nasdaq100 <- local({
  # Re-index by the dates held in the row names so the series can be shifted.
  prices <- xts(nasdaq100, order.by = as.Date(rownames(nasdaq100)))

  # Lead the outcome series by one observation: each row now sees the next
  # trading day's value. The latest row has no "tomorrow" and becomes NA.
  tomorrow <- as.numeric(lag(prices[, outcomeSymbol], -1))

  labelled <- as.data.frame(prices)

  # outcome is 1 when tomorrow's volume is larger than today's, otherwise 0.
  # Comparing the shifted vector directly avoids depending on the name that
  # merge() would give a duplicated column (the old code looked it up under
  # two different, non-matching names and an undefined variable).
  labelled$outcome <- ifelse(tomorrow > labelled[, outcomeSymbol], 1, 0)

  # Later chunks read `nasdaq100$date` and drop the first row as "most
  # recent", so expose the date as a true Date column and sort newest first.
  labelled$date <- as.Date(rownames(labelled))
  labelled[order(labelled$date, decreasing = TRUE), ]
})
```
```{r difference features}
# helper that standardises numeric columns and adds lagged-difference features ####
# Standardise numeric columns and append lagged-difference features.
#
# For every numeric column of `objDF` not listed in `offLimitsSymbols`:
#   1. the column is replaced by its centred/scaled values (via scale()),
#      rounded to `roundByScale` decimals;
#   2. for each `day` in `days`, a column named "<column>_<day>" is added,
#      holding row i minus row i + day of the scaled column. The trailing
#      rows, which have no partner `day` rows further down, are set to 0.
#
# Arguments:
#   objDF            data frame to transform.
#   days             row offsets to difference over.
#   offLimitsSymbols names of columns to leave untouched (e.g. the label).
#   roundByScale     number of decimals kept after scaling.
#
# Returns `objDF` with scaled columns and the added difference columns.
# Progress is printed for every processed column.
#
# NOTE(review): "row i minus row i + day" is a look-back difference only if
# rows are ordered newest first - confirm at the call site.
GetDiffDay <- function(objDF, days = c(10), offLimitsSymbols = c('outcome'), roundByScale = 3) {
  # vapply keeps the mask a logical vector even for a zero-column input.
  is_num <- vapply(objDF, is.numeric, logical(1))
  target_cols <- names(objDF)[is_num]
  target_cols <- target_cols[!target_cols %in% offLimitsSymbols]
  n_rows <- nrow(objDF)

  for (sym in target_cols) {
    print(paste('****************', sym))
    # scale() returns a one-column matrix; store it as a plain numeric vector.
    objDF[[sym]] <- as.numeric(round(scale(objDF[[sym]]), roundByScale))
    print(paste('theColName', sym))

    for (day in days) {
      # An offset that reaches past the table has no valid differences;
      # fall back to an all-zero column instead of a length-mismatch error.
      lagged <- if (day < n_rows) diff(objDF[[sym]], lag = day) else numeric(0)
      padding <- rep(0, n_rows - length(lagged))
      objDF[[paste0(sym, '_', day)]] <- c(lagged, padding) * -1
    }
  }

  objDF
}
```
```{r}
# call the above functions ####
# Scale every numeric predictor (the label is excluded) and add difference
# features for each listed row offset. The argument is spelled `days` in
# full rather than relying on partial matching of `day`.
nasdaq100 <- GetDiffDay(
  nasdaq100,
  days = c(1, 2, 3, 4, 5, 10, 20),
  offLimitsSymbols = c("outcome"),
  roundByScale = 3
)
```
```{r adjusting window}
# Drop the first row: it is treated as the most recent entry, which has no
# next-day outcome. Negative indexing stays correct for tables with fewer
# than two rows, where `2:nrow(x)` would count backwards.
nasdaq100 <- nasdaq100[-1, ]

# Calendar features derived from the date via POSIXlt:
#   week - day of the week (0-6, Sunday is 0)
#   yday - day of the month (1-31)
#   mon  - month (0-11)
# NOTE(review): `yday` is filled from `$mday` (day of month), not `$yday`
# (day of year); the name and the value disagree - confirm which is intended.
nasdaq100$week <- as.POSIXlt(nasdaq100$date)$wday
nasdaq100$yday <- as.POSIXlt(nasdaq100$date)$mday
nasdaq100$mon <- as.POSIXlt(nasdaq100$date)$mon

# The raw date is not used as a predictor; remove it, then shuffle the rows.
# NOTE(review): this shuffle runs before any set.seed() call in this file,
# so the row order is not reproducible between runs.
nasdaq100 <- nasdaq100[, names(nasdaq100) != "date", drop = FALSE]
nasdaq100 <- nasdaq100[sample(nrow(nasdaq100)), ]
```
```{r predicting}
# modeling
# Every column except the label is a predictor.
predictorNames <- names(nasdaq100)[names(nasdaq100) != "outcome"]

set.seed(1234)

# determining train and testing data sets: a random 70% of rows train the
# model, the remaining rows are held out for evaluation
split <- sample(nrow(nasdaq100), floor(0.7 * nrow(nasdaq100)))
train <- nasdaq100[split, ]
test <- nasdaq100[-split, ]

# Gradient-boosted trees for the binary up/down label.
# Fixes over the previous call, which could not run:
#   * `sgboost` is not defined anywhere; the function is xgboost().
#   * `noround` is not a parameter; the boosting-round count is `nrounds`.
#   * `subsample` is a row-sampling fraction and must lie in (0, 1].
#     NOTE(review): 8.6 is read as a typo for 0.86 - confirm the value.
bst <- xgboost(
  data = as.matrix(train[, predictorNames]),
  label = train$outcome,
  verbose = 0,
  eta = 0.1,
  gamma = 50,
  nrounds = 50,
  colsample_bytree = 0.1,
  subsample = 0.86,
  objective = "binary:logistic"
)

# outputmargin = TRUE requests untransformed margin scores rather than
# probabilities.
predictions <- predict(bst, as.matrix(test[, predictorNames]),
                       outputmargin = TRUE)
```
```{r model analysis using AUC}
# Build the ROC curve for the held-out rows (true labels vs. model scores)
# and report the area under it.
auc <- roc(
  response = test$outcome,
  predictor = predictions
)
auc_message <- paste('AUC score --> ', auc$auc)
print(auc_message)
```