-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
284 lines (238 loc) · 10.6 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
class Model(object):
    """Train LSTM price predictors, optionally enriched with tweet sentiment.

    The class keeps three input frames (tweets, the tweet-to-company mapping
    and the company table) and offers two training entry points:
    ``tsla_train`` (prices merged with a daily sentiment score) and
    ``etfs_train`` (prices only, one model per CSV file).  The artefacts of
    the most recent training run are kept on ``self._last_run`` so that
    ``plot`` can draw predictions without relying on leaked local variables.

    NOTE(review): the visible source contains no import statements, yet the
    methods reference ``pd``, ``np``, ``plt``, ``SentimentIntensityAnalyzer``,
    ``MinMaxScaler``, ``models``, ``layers``, ``LSTM`` and ``Dropout``.  These
    names must be available at module level -- confirm.
    """

    # Number of consecutive rows fed to the LSTM for each prediction.
    N_TIMESTEPS = 30
    # Positional index of the column used as the prediction target.
    # NOTE(review): presumably the "Close" column of an OHLCV frame (the
    # original plotting code referred to it by that name) -- verify against
    # the input data.
    LABEL_COLUMN = 3
    # Share of the windows (in chronological order) used for fitting.
    TRAIN_FRACTION = 0.80
    BATCH_SIZE = 128

    def __init__(self, df_tweet=None, df_company_tweet=None, df_company=None):
        """Store the input frames; nothing is computed at construction time."""
        self.tweet = df_tweet
        self.company_tweet = df_company_tweet
        self.company = df_company
        # Filled by the training methods, consumed by ``plot``.
        self._last_run = None

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _as_path_list(path):
        """Normalise ``path`` (one path or an iterable of paths) to a list."""
        if path is None:
            raise ValueError("path must be a CSV path or an iterable of CSV paths")
        if isinstance(path, str) or hasattr(path, "__fspath__"):
            return [path]
        return list(path)

    @staticmethod
    def _make_windows(frame, n_timesteps, label_column):
        """Cut ``frame`` into sliding windows and their next-row labels.

        Returns ``(features, labels)`` where ``features[k]`` holds rows
        ``k .. k + n_timesteps - 1`` of ``frame`` (all columns) and
        ``labels[k]`` is the value of column ``label_column`` in the row that
        immediately follows that window.

        Raises:
            ValueError: if ``frame`` has no row left after the first window.
        """
        n_rows = frame.shape[0]
        if n_rows <= n_timesteps:
            raise ValueError(
                "need more than {} rows to build a window, got {}".format(
                    n_timesteps, n_rows
                )
            )
        values = frame.to_numpy()
        features = np.stack(
            [values[end - n_timesteps:end] for end in range(n_timesteps, n_rows)]
        )
        labels = frame.iloc[n_timesteps:, label_column].to_numpy()
        return features, labels

    def _fit_lstm(self, frame, epochs, dropout_before_dense, title):
        """Window, split, scale and fit one LSTM on ``frame``.

        Args:
            frame: DataFrame whose rows are consecutive observations; every
                column is used as a feature.
            epochs: number of training epochs.
            dropout_before_dense: when true the Dropout layer sits between the
                LSTM and the 100-unit Dense layer, otherwise after that Dense
                layer.
            title: label used by ``plot`` for the prediction figure.

        Returns:
            ``(model, history)`` exactly as produced by ``model.fit``.
        """
        n_timesteps = self.N_TIMESTEPS
        features, labels = self._make_windows(
            frame, n_timesteps, self.LABEL_COLUMN
        )

        # Chronological split: the first 80% of the windows train the model.
        n_train = int(features.shape[0] * self.TRAIN_FRACTION)
        if n_train == 0:
            raise ValueError("not enough windows to form a training set")
        x_train, x_test = features[:n_train], features[n_train:]
        y_train, y_test = labels[:n_train], labels[n_train:]

        # The scalers work on 2-D input, so windows are flattened to
        # (rows, features) and restored afterwards.  Both scalers are fitted
        # on the training part only.
        n_features = features.shape[-1]
        scaler_x = MinMaxScaler()
        scaler_y = MinMaxScaler()
        x_train_scaled = scaler_x.fit_transform(
            x_train.reshape(-1, n_features)
        ).reshape(x_train.shape)
        x_test_scaled = scaler_x.transform(
            x_test.reshape(-1, n_features)
        ).reshape(x_test.shape)
        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).reshape(-1)

        model_lstm = models.Sequential()
        model_lstm.add(
            LSTM(
                512,
                return_sequences=False,
                activation="tanh",
                # Derived from the data instead of a hard-coded column count.
                input_shape=(n_timesteps, n_features),
            )
        )
        if dropout_before_dense:
            model_lstm.add(Dropout(0.4))
            model_lstm.add(layers.Dense(100, activation="linear"))
        else:
            model_lstm.add(layers.Dense(100, activation="linear"))
            model_lstm.add(Dropout(0.4))
        model_lstm.add(layers.Dense(1, activation="relu"))

        model_lstm.compile(optimizer="adam", loss="mse")
        history = model_lstm.fit(
            x_train_scaled,
            y_train_scaled,
            epochs=epochs,
            batch_size=self.BATCH_SIZE,
            validation_data=(x_test_scaled, y_test_scaled),
        )

        # Everything ``plot`` needs to draw predictions for this run.
        self._last_run = {
            "title": title,
            "prices": frame,
            "n_timesteps": n_timesteps,
            "n_train": n_train,
            "model": model_lstm,
            "scaler_y": scaler_y,
            "x_train": x_train_scaled,
            "x_test": x_test_scaled,
            "history": history,
        }
        return model_lstm, history

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def get_sentiment(self):
        """Return the mean daily compound sentiment of TSLA tweets.

        Tweets are joined to ``self.company_tweet`` on ``tweet_id``, filtered
        to ``ticker_symbol == "TSLA"``, scored with
        ``SentimentIntensityAnalyzer().polarity_scores`` and averaged per
        calendar day.  ``self.tweet`` is left untouched, so the method can be
        called repeatedly.

        Returns:
            DataFrame indexed by ``post_date`` (``datetime.date`` values) with
            a single ``score`` column.
        """
        merged = self.tweet.merge(self.company_tweet, on="tweet_id")
        tsla = merged.loc[merged["ticker_symbol"] == "TSLA", :].reset_index()
        # ``post_date`` is parsed as seconds since the epoch, then truncated
        # to the calendar day used as the grouping key.
        tsla["post_date"] = pd.to_datetime(tsla["post_date"], unit="s").dt.date

        sia = SentimentIntensityAnalyzer()
        tsla["compound"] = tsla["body"].apply(
            lambda text: sia.polarity_scores(text)["compound"]
        )
        return tsla.groupby("post_date").agg(score=("compound", "mean"))

    def tsla_train(self, tsla_daily=None):
        """Fit an LSTM on TSLA daily prices joined with daily sentiment.

        Args:
            tsla_daily: DataFrame of daily prices indexed by date; rows from
                2015-01-01 through 2019-12-31 are used.

        Returns:
            ``(model, history)`` from the fit.

        Raises:
            ValueError: if ``tsla_daily`` is missing or too short.
        """
        if tsla_daily is None:
            raise ValueError("tsla_daily must be a DataFrame of daily TSLA prices")

        sentiment = self.get_sentiment()
        # get_sentiment() indexes by datetime.date objects.
        # NOTE(review): the price frame is assumed to carry a DatetimeIndex;
        # converting here lets the index join below line up with it -- confirm.
        sentiment.index = pd.to_datetime(sentiment.index)

        prices = tsla_daily.loc["2015-01-01":"2019-12-31"].merge(
            sentiment, left_index=True, right_index=True
        )
        return self._fit_lstm(
            prices,
            epochs=25,
            dropout_before_dense=True,
            title="Tesla With Public Opinion",
        )

    def plot(self, history_lstm=None):
        """Plot the loss curves and, when available, the predictions.

        Args:
            history_lstm: the object returned by ``model.fit`` (anything with
                a ``history`` attribute) or the plain ``{"loss": ...,
                "val_loss": ...}`` mapping.  Defaults to the history of the
                most recent training run.

        The prediction figure uses the artefacts of the most recent training
        run on this instance; it is skipped when no run has been recorded.

        Raises:
            ValueError: if no history is given and nothing has been trained.
        """
        run = getattr(self, "_last_run", None)
        if history_lstm is None:
            if run is None:
                raise ValueError("no history given and no training run recorded")
            history_lstm = run["history"]
        curves = getattr(history_lstm, "history", history_lstm)

        train_mse = curves["loss"]
        test_mse = curves["val_loss"]
        min_test_mse = min(test_mse)
        # One point per recorded epoch; no dependency on the trainer's locals.
        epochs = range(1, len(train_mse) + 1)

        plt.figure(figsize=(16, 6))
        plt.plot(epochs, train_mse, label="train MSE", color="blue")
        plt.plot(epochs, test_mse, label="test MSE", color="red")
        plt.title(
            "LSTM: min(test MSE) = " + str(round(min_test_mse, 6)), fontsize=16
        )
        plt.xlabel("Epochs", fontsize=16)
        plt.ylabel("MSE", fontsize=16)
        plt.legend(loc="upper right", fontsize=16)
        plt.show()

        if run is None:
            return None

        prices = run["prices"]
        model_lstm = run["model"]
        scaler_y = run["scaler_y"]
        target = prices.columns[self.LABEL_COLUMN]
        # The first window consumes ``n_timesteps`` rows, so labels (and thus
        # predictions) start at that offset.
        first = run["n_timesteps"]
        split = first + run["n_train"]

        # Copies, so adding the prediction column never touches ``prices``.
        train = prices.iloc[first:split, :].copy()
        train["prediction"] = scaler_y.inverse_transform(
            model_lstm.predict(run["x_train"])
        ).ravel()
        valid = prices.iloc[split:, :].copy()
        valid["prediction"] = scaler_y.inverse_transform(
            model_lstm.predict(run["x_test"])
        ).ravel()

        # The seaborn style sheet was renamed in newer Matplotlib releases;
        # use whichever name this installation provides.
        for style_name in ("seaborn-v0_8", "seaborn"):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        plt.figure(figsize=(16, 6))
        plt.plot(train[target], label="Train")
        plt.plot(valid[target], label="Validation")
        plt.plot(valid["prediction"], label="Prediction")
        plt.title("{} (min(mse) = {:.4f})".format(run["title"], min_test_mse))
        plt.legend()
        plt.show()
        return None

    def etfs_train(self, path=None):
        """Fit one LSTM per price CSV and plot each run.

        Args:
            path: a single CSV path or an iterable of CSV paths.  Each file is
                read with its ``Date`` column parsed and used as the index.

        Returns:
            List of fitted models, in the order of the given paths.

        Raises:
            ValueError: if ``path`` is missing or a file is too short.
        """
        from pathlib import Path

        trained_models = []
        for csv_path in self._as_path_list(path):
            price_daily = pd.read_csv(
                csv_path, parse_dates=["Date"], index_col="Date"
            )
            # File name without directory and extension labels the plot.
            company = Path(csv_path).stem
            model_lstm, history = self._fit_lstm(
                price_daily,
                epochs=15,
                dropout_before_dense=False,
                title=company,
            )
            trained_models.append(model_lstm)
            self.plot(history_lstm=history)
        return trained_models