-
Notifications
You must be signed in to change notification settings - Fork 0
/
ARIMA-RF.py
312 lines (255 loc) · 9.33 KB
/
ARIMA-RF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pylab import mpl
from statsmodels.graphics.gofplots import qqplot
plt.rcParams['axes.unicode_minus']=False
mpl.rcParams['axes.unicode_minus']=False
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.sans-serif'] = ['SimHei']
# 数据
dates = ['2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06', '2021-07', '2021-08', '2021-09', '2021-10', '2021-11', '2021-12', '2022-01', '2022-02', '2022-03', '2022-04', '2022-05', '2022-06', '2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12', '2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06', '2023-07', '2023-08', '2023-09', '2023-10', '2023-11', '2023-12']
prices = [230, 230, 240, 250, 240, 215, 210, 205, 205, 205, 200, 200, 200, 230, 250, 220, 210, 180, 170, 185, 205, 200, 200, 200, 195, 200, 200, 220, 250, 260, 240, 235, 210, 200, 200, 200]
# 将数据转换成pandas DataFrame
data = pd.DataFrame({'Date': dates, 'Price': prices})
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)
# 绘制时序图
plt.figure(figsize=(12, 6))
plt.plot(data['Price'], marker='o')
plt.title('长江成品油运价时序图')
plt.xlabel('日期')
plt.ylabel('价格')
plt.grid(True)
plt.show()
# 进行ADF检验
result = adfuller(data['Price'])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
# 判断数据的稳定性
if result[0] < result[4]["5%"]:
print("数据是平稳的")
else:
print("数据是非平稳的")
# 对Price进行一阶差分
data['Price_Diff'] = data['Price'].diff()
# 去除NaN值
data_diff = data.dropna()
# 绘制一阶差分后的时序图
plt.figure(figsize=(12, 6))
plt.plot(data_diff['Price_Diff'], marker='o')
plt.title('长江成品油运价一阶差分时序图')
plt.xlabel('日期')
plt.ylabel('价格差分')
plt.grid(True)
plt.show()
# 对一阶差分序列进行ADF检验
result_diff = adfuller(data_diff['Price_Diff'])
print('ADF Statistic for Differenced Series: %f' % result_diff[0])
print('p-value: %f' % result_diff[1])
print('Critical Values:')
for key, value in result_diff[4].items():
print('\t%s: %.3f' % (key, value))
# 判断差分序列的稳定性
if result_diff[0] < result_diff[4]["5%"]:
print("差分序列是平稳的")
else:
print("差分序列是非平稳的")
# 绘制ACF图
plt.figure(figsize=(12, 6))
plot_acf(data_diff['Price_Diff'], lags=20)
plt.title('一阶差分序列的自相关系数 (ACF)')
plt.show()
# 绘制PACF图
plt.figure(figsize=(12, 6))
plot_pacf(data_diff['Price_Diff'], lags=16)
plt.title('一阶差分序列的偏自相关系数 (PACF)')
plt.show()
# 使用ARIMA(2,1,2)模型进行拟合
model = ARIMA(data['Price'], order=(2, 1, 2))
model_fit = model.fit()
# 进行预测
data['Forecast'] = model_fit.predict(start=0, end=len(data) - 1, typ='levels')
data['Forecast'][0] = data['Price'][0]
print('打印:', data['Forecast'].values)
y_pred_arima_list = data['Forecast'].tolist()
y_pred_arima_list = [float(i) for i in y_pred_arima_list]
print(type(data['Forecast']))
# 计算残差
data['Residual'] = data['Price'] - data['Forecast']
# 绘制预测数据的时序图
plt.figure(figsize=(12, 6))
plt.plot(data['Price'], label='Actual')
plt.plot(data['Forecast'], label='Forecast')
plt.title('实际数据与ARIMA(2,1,2)预测数据的时序图')
plt.xlabel('日期')
plt.ylabel('价格')
plt.legend()
plt.grid(True)
plt.show()
# 绘制残差序列的时序图
plt.figure(figsize=(12, 6))
plt.plot(data['Residual'], label='Residual')
plt.title('残差序列的时序图')
plt.xlabel('日期')
plt.ylabel('残差')
plt.legend()
plt.grid(True)
plt.show()
# 绘制残差序列的ACF图
plt.figure(figsize=(12, 6))
plot_acf(data['Residual'].dropna(), lags=20)
plt.title('残差序列的自相关系数 (ACF)')
plt.show()
# 绘制残差序列的PACF图
plt.figure(figsize=(12, 6))
plot_pacf(data['Residual'].dropna(), lags=17)
plt.title('残差序列的偏自相关系数 (PACF)')
plt.show()
# 绘制残差序列的QQ图
plt.figure(figsize=(8, 8))
qqplot(data['Residual'].dropna(), line='s')
plt.title('残差序列的QQ图')
plt.show()
# 打印实际数据、预测数据、对应误差的表格
print(data[['Price', 'Forecast', 'Residual']])
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
# 生成模拟影响因子数据
np.random.seed(0)
factor1 = np.random.normal(0, 1, 36)
factor2 = np.random.normal(0, 1, 36)
# 创建DataFrame
data = pd.DataFrame({'Date': dates, 'Price': prices, 'Factor1': factor1, 'Factor2': factor2})
# 分割数据集
X = data[['Factor1', 'Factor2']]
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# 构建随机森林模型
rf = RandomForestRegressor(random_state=0)
# 设置网格搜索的参数范围
param_grid = {
'n_estimators': [10, 50, 100],
'max_depth': [None, 10, 20, 30],
}
# 进行网格搜索
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
# 使用最佳参数的模型进行预测
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X)
print('打印:', y_pred)
y_pred_rf_list = y_pred.tolist()
y_pred_rf_list = [float(i) for i in y_pred_rf_list]
# 计算MSE, RMSE, MAPE
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y, y_pred)
# 打印评价指标
print(f'MSE: {mse}, RMSE: {rmse}, MAPE: {mape}')
# 绘制时序图
plt.figure(figsize=(12, 6))
plt.plot(data['Date'], y, label='Actual')
plt.plot(data['Date'], y_pred, label='RF_Predicted')
plt.title('实际与预测价格的时序图')
plt.xlabel('日期')
plt.ylabel('价格')
plt.legend()
plt.grid(True)
plt.xticks(rotation=90)
plt.show()
print(type(y_pred))
combined_pred = []
# 计算组合预测结果
for i, j in zip(y_pred_arima_list, y_pred_rf_list):
combined_data = 0.88 * i + 0.12 * j
combined_pred.append(combined_data)
date_list = data['Date'].tolist()
# 绘制时序图
plt.figure(figsize=(12, 6))
plt.plot(date_list, y, label='Actual')
plt.plot(date_list, combined_pred, label='Combined Predicted')
plt.title('实际数据与组合预测结果的时序图')
plt.xlabel('日期')
plt.ylabel('价格')
plt.legend()
plt.grid(True)
plt.show()
# 计算评价指标
mse_combined = mean_squared_error(y, combined_pred)
rmse_combined = np.sqrt(mse_combined)
mape_combined = mean_absolute_percentage_error(y, combined_pred)
# 打印评价指标
print(f'Combined Model MSE: {mse_combined}, RMSE: {rmse_combined}, MAPE: {mape_combined}')
# y: 实际数据
# y_pred_arima: ARIMA模型的预测结果
# y_pred_rf: RF模型的预测结果
# combined_pred: ARIMA-RF组合预测结果
# 绘制时序图
plt.figure(figsize=(12, 6))
plt.plot(data['Date'], y, label='Actual', color='blue')
plt.plot(data['Date'], y_pred_arima_list, label='ARIMA Predicted', color='red')
plt.plot(data['Date'], y_pred_rf_list, label='RF Predicted', color='green')
plt.plot(data['Date'], combined_pred, label='Combined Predicted', color='purple')
plt.title('实际数据与各模型预测结果的时序图')
plt.xlabel('日期')
plt.ylabel('价格')
plt.legend()
plt.grid(True)
plt.xticks(rotation=90)
plt.show()
# 计算评价指标
mse_arima = mean_squared_error(y, y_pred_arima_list)
rmse_arima = np.sqrt(mse_arima)
mape_arima = mean_absolute_percentage_error(y, y_pred_arima_list)
mse_rf = mean_squared_error(y, y_pred_rf_list)
rmse_rf = np.sqrt(mse_rf)
mape_rf = mean_absolute_percentage_error(y, y_pred_rf_list)
mse_combined = mean_squared_error(y, combined_pred)
rmse_combined = np.sqrt(mse_combined)
mape_combined = mean_absolute_percentage_error(y, combined_pred)
# 创建评估指标表格
metrics_df = pd.DataFrame({
'Model': ['ARIMA', 'RF', 'ARIMA-RF Combined'],
'MSE': [mse_arima, mse_rf, mse_combined],
'RMSE': [rmse_arima, rmse_rf, rmse_combined],
'MAPE': [mape_arima, mape_rf, mape_combined]
})
# 打印表格
print(metrics_df)
import pandas as pd
from pandas.tseries.offsets import MonthEnd
# 假设已有的最后一个日期是2023-12
last_date = pd.to_datetime('2023-12')
# 生成后一年的日期列表
future_dates = [last_date + MonthEnd(n) for n in range(1, 13)]
future_dates_df = pd.DataFrame({'Date': future_dates})
# 预测后一年的数据
future_arima_pred = model_fit.forecast(steps=12) # ARIMA模型预测后一年
future_rf_pred = np.array([np.mean(y_pred_rf_list[-12:]) for _ in range(12)]) # 使用RF模型最后12个月的平均值作为简单预测
# 组合模型的预测
future_combined_pred = 0.88 * future_arima_pred + 0.12 * future_rf_pred
# 创建预测数据的DataFrame
future_pred_df = pd.DataFrame({
'Date': future_dates,
'ARIMA_Pred': future_arima_pred,
'RF_Pred': future_rf_pred,
'Combined_Pred': future_combined_pred
})
# 绘制未来一年的预测数据
plt.figure(figsize=(12, 6))
plt.plot(future_dates_df['Date'], future_pred_df['Combined_Pred'], label='Combined Predicted')
plt.title('后一年的价格预测')
plt.xlabel('日期')
plt.ylabel('价格')
plt.legend()
plt.grid(True)
plt.xticks(rotation=90)
plt.show()