pmdarima中的pipeline和autoarima
程序员文章站
2023-12-30 14:44:52
...
最近一段时间在学习时间序列,其中用到了 pmdarima 这个库,做了一些简单的笔记。
在传统的时间序列分析中,我们需要先对数据进行平稳化处理(包括差分、取对数),再进行白噪声检验,定阶之后才能建模和预测;而使用 pmdarima 这个库可以直接对原始数据进行建模(auto_arima 会自动搜索合适的模型阶数),可以说是BUG级别的存在。不过关于哪种方法的准确性更好,我还没有得到结论。
这里只介绍其中两个功能:pipeline(Pipeline 类)和 auto_arima(AutoARIMA)。
import numpy as np
import pmdarima as pm
from pmdarima import pipeline
from pmdarima import model_selection
from pmdarima import preprocessing as ppc
from pmdarima import arima
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Report the installed pmdarima release; the outputs quoted in the comments
# below were recorded with pmdarima 1.8.3.
print("pmdarima version: %s" % pm.__version__)

# Load the wineind dataset and keep the first 150 observations for training;
# the remainder is held out as the test set (the original notes record 176
# observations in total, i.e. 26 test samples).
data = pm.datasets.load_wineind()
train, test = model_selection.train_test_split(data, train_size=150)

# Build a two-stage pipeline. The wineind series is seasonal, so a
# FourierFeaturizer is placed in front of the ARIMA stage, which lets the
# ARIMA search itself run without a seasonal component.
fourier_stage = ppc.FourierFeaturizer(m=12, k=4)
arima_stage = arima.AutoARIMA(
    stepwise=True,
    trace=1,
    error_action="ignore",
    seasonal=False,  # seasonality is carried by the Fourier stage instead
    suppress_warnings=True,
)
pipe = pipeline.Pipeline([
    ("fourier", fourier_stage),
    ("arima", arima_stage),
])
pipe.fit(train)
# Search trace printed during the fit (recorded run):
#   Performing stepwise search to minimize aic
#   ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=2819.938, Time=0.44 sec
#   ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=2942.625, Time=0.03 sec
#   ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=2867.514, Time=0.05 sec
#   ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=2830.585, Time=0.25 sec
#   ARIMA(0,1,0)(0,0,0)[0] : AIC=2940.651, Time=0.08 sec
#   ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=2817.535, Time=0.31 sec
#   ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=2814.904, Time=0.28 sec
#   ARIMA(0,1,3)(0,0,0)[0] intercept : AIC=2818.704, Time=0.42 sec
#   ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=2817.377, Time=0.27 sec
#   ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=inf, Time=0.49 sec
#   ARIMA(0,1,2)(0,0,0)[0] : AIC=2815.283, Time=0.16 sec
#   Best model: ARIMA(0,1,2)(0,0,0)[0] intercept
#   Total fit time: 2.785 seconds

print("Model fit:")
print(pipe)
# Recorded output:
#   Pipeline(steps=[('fourier', FourierFeaturizer(k=4, m=12)),
#                   ('arima',
#                    AutoARIMA(error_action='ignore', seasonal=False, trace=1))])

# Forecasting works the same way as on a plain ARIMA object; n_periods sets
# how many future values are returned in preds.
preds, conf_int = pipe.predict(n_periods=10, return_conf_int=True)
print("\nForecasts:")
print(preds)
# Recorded output (10 forecasts):
#   Forecasts:
#   [28518.72113507 29963.35395259 25827.01547833 25060.73664694
#    34235.75497953 33509.00343109 21083.11245407 19764.81581617
#    25895.72238967 25433.9826346 ]
# Let's take a look at the actual vs. the predicted values:
fig, axes = plt.subplots(3, 1, figsize=(12, 8))
fig.tight_layout()  # auto-adjust subplot parameters so the panels fill the figure

# Visualize goodness of fit: predictions over the training range itself.
# in_sample_preds is used below as a 1-D series and in_sample_confint as a
# 2-D array whose column 0 is the lower bound and column 1 the upper bound.
in_sample_preds, in_sample_confint = pipe.predict_in_sample(
    X=None, return_conf_int=True)

n_train = train.shape[0]  # number of training samples (150 here)
x0 = np.arange(n_train)  # 0..149
axes[0].plot(x0, train, alpha=0.75)  # observed training values
axes[0].scatter(x0, in_sample_preds, alpha=0.4, marker='x')  # in-sample predictions
axes[0].fill_between(
    x0,
    in_sample_confint[:, 0],  # lower edge of the shaded interval
    in_sample_confint[:, 1],  # upper edge of the shaded interval
    alpha=0.1,
    color='b',
)
axes[0].set_title('Actual train samples vs. in-sample predictions')
axes[0].set_xlim((0, x0.shape[0]))

# Visualize actual + predicted
x1 = np.arange(n_train + preds.shape[0])  # training range plus forecast horizon (150 + 10)
axes[1].plot(x1[:n_train], train, alpha=0.75)  # the 150 observed training values
axes[1].scatter(x1[n_train:], preds, alpha=0.4, marker='o')  # forecasts
# Observed test values, truncated to the forecast horizon (test holds 26
# samples; only the first preds.shape[0] are compared).
axes[1].scatter(x1[n_train:], test[:preds.shape[0]], alpha=0.4, marker='x')
axes[1].fill_between(
    x1[n_train:],  # x coordinates of the forecast horizon
    conf_int[:, 0],  # lower edge of the shaded interval
    conf_int[:, 1],  # upper edge of the shaded interval
    alpha=0.1,
    color='b',
)
axes[1].set_title('Actual test samples vs. forecasts')
axes[1].set_xlim((0, data.shape[0]))
# `update` can be called on the pipeline object directly; it updates the
# intermediate transformers where necessary (compare with `update` on an
# ARIMA model).
# Treat the first 15 test samples as newly observed data and keep the
# remaining ones (11 in the recorded run) as the hold-out set.
fresh_obs = test[:15]
holdout = test[15:]
pipe.update(fresh_obs, maxiter=10)

# Forecasts now start right after the newly observed values; request one
# forecast per remaining hold-out sample.
holdout_preds = pipe.predict(n_periods=holdout.shape[0])
print(holdout_preds)
# Recorded output:
#   [26536.03461603 34421.8718367 33695.11312236 21269.22053055
#    19950.92875656 26081.83484715 25620.09236979 24414.18423212
#    26098.77243124 28871.52923078 30770.54774092]

full_axis = np.arange(data.shape[0])
n_seen = n_train + fresh_obs.shape[0]  # 150 training + 15 newly observed
bottom = axes[2]
bottom.plot(full_axis[:n_train], train, alpha=0.75)  # original training data
# The newly observed samples are actual data too, drawn in orange.
bottom.plot(full_axis[n_train:n_seen], fresh_obs, alpha=0.75, c='orange')
bottom.scatter(full_axis[n_seen:], holdout_preds, alpha=0.4, marker='o')  # forecasts
bottom.scatter(full_axis[n_seen:], holdout, alpha=0.4, marker='x')  # remaining actual values
bottom.set_title('Actual test samples vs. forecasts')
bottom.set_xlim((0, data.shape[0]))
plt.show()
import pmdarima as pm
from pmdarima import model_selection
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
# #############################################################################
# Load the lynx dataset and split it: the first 100 observations are used for
# training and the rest for testing (model_selection.train_test_split does
# the split).
data = pm.datasets.load_lynx()
train, test = model_selection.train_test_split(data, train_size=100)

# Search settings for auto_arima, fitted on the training data only.
# NOTE(review): `m` is left at its default here; the seasonal arguments
# (start_P/start_Q/max_P/max_Q/D/max_D) presumably have no effect unless a
# seasonal period m > 1 is supplied -- confirm against the pmdarima docs.
search_kwargs = dict(
    start_p=1,  # starting value of p for the search
    start_q=1,  # starting value of q for the search
    start_P=1,
    start_Q=1,
    max_p=5,  # largest p considered
    max_q=5,  # largest q considered
    max_P=5,
    max_Q=5,
    seasonal=True,
    stepwise=True,  # stepwise search; False would try every combination instead
    suppress_warnings=True,
    D=10,
    max_D=10,
    error_action='ignore',
)
modl = pm.auto_arima(train, **search_kwargs)
# Selected model in the recorded run: ARIMA(2,0,0)

# Forecast as many periods as there are test samples; conf_int holds the
# lower/upper bounds of the interval around each forecast.
preds, conf_int = modl.predict(n_periods=test.shape[0], return_conf_int=True)
# NOTE: the author observed that these forecasts look visibly poor.

# Report the root-mean-squared error between the actual test values and the
# forecasts.
rmse = np.sqrt(mean_squared_error(test, preds))
print("Test RMSE: %.3f" % rmse)
# #############################################################################
# Plot the training data, the forecasts, the actual test values and the
# forecast interval against calendar years.
n_fit = train.shape[0]
years = np.arange(n_fit + preds.shape[0]) + 1821  # Year starts at 1821 (indices 0..113)
fit_years = years[:n_fit]
forecast_years = years[n_fit:]

plt.plot(fit_years, train, alpha=0.75)  # observed training data
plt.plot(forecast_years, preds, alpha=0.75)  # forecasts
plt.scatter(forecast_years, test, alpha=0.4, marker='x')  # actual test data
# Shade the band between the lower and upper interval bounds of each forecast.
plt.fill_between(
    forecast_years,
    conf_int[:, 0],
    conf_int[:, 1],
    alpha=0.1,
    color='b',
)
plt.title("Lynx forecasts")
plt.xlabel("Year")
plt.show()
推荐阅读
-
pmdarima中的pipeline和autoarima
-
tensorflow中的Fetch和Feed
-
php学习之道:php中is_file和file_exist的区别,and判断目录is_dir
-
转帖:一分钟教会你用google图表中的曲线图和柱状图
-
Ecshop安装过程中的的相关问题:cls_image::gd_version()和不支持JPEG
-
分享下个人关于js中的同步和异步的理解
-
请大家帮我看一下我这段代码中的ul和a为什么不能显示在同一行中。_html/css_WEB-ITnose
-
PHP中exec函数和shell_exec函数的区别,execshell_exec_PHP教程
-
PHP中的按位与和按位或操作示例_PHP教程
-
python中多进程和进程池(Processing库)的实例代码