import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Select a font that can render the Korean labels used in the plots.
plt.rcParams['font.family'] = 'Noto Sans KR'

# Load the data set; the 'Passengers' column is the series analysed below.
df = pd.read_csv('_data/air.csv')
계절성 고려
확률 통계
시계열 분석
SARIMA 모델
- \(SARIMA(p, d, q)(P, D, Q)_m\) 형태로 표현
- \(m\): 계절성 주기
- \(P, D, Q\): 계절성 AR, 차분, MA 차수
from statsmodels.tsa.seasonal import STL
# STL decomposition of the series with a seasonal period of 12 observations.
decomposition = STL(df['Passengers'], period=12).fit()
from statsmodels.tsa.stattools import adfuller
# First-order difference of the series (y[t] - y[t-1]).
df_diff = np.diff(df['Passengers'], n=1)

# Augmented Dickey-Fuller test on the differenced series; the first two
# elements of the result are displayed (test statistic, p-value).
ADF_result = adfuller(df_diff)
ADF_result[0], ADF_result[1]
(-2.8292668241700025, 0.05421329028382508)
# Seasonal difference at lag 12 (y[t] - y[t-12]), consistent with the
# D=1, s=12 seasonal part of the SARIMA model fitted later.
# np.diff(df_diff, n=12) is NOT a seasonal difference: `n` is the number of
# times the lag-1 difference is applied, so it would difference 12 times.
# NOTE(review): ADF figures recorded from the n=12 version do not correspond
# to this computation; re-run the cell to refresh them.
seasonal_lag = 12
diff_values = np.asarray(df_diff)
df_diff = diff_values[seasonal_lag:] - diff_values[:-seasonal_lag]

# Augmented Dickey-Fuller test on the seasonally differenced series; the
# first two elements of the result are displayed (test statistic, p-value).
ADF_result = adfuller(df_diff)
ADF_result[0], ADF_result[1]
(-17.624862360208343, 3.823046855816035e-30)
from typing import Union
from statsmodels.tsa.statespace.sarimax import SARIMAX
def optimize_SARIMA(endog: Union[pd.Series, list], order_list: list, d: int, D: int, s: int) -> pd.DataFrame:
    """Fit one SARIMA model per candidate order and rank the fits by AIC.

    Args:
        endog: Series to model.
        order_list: Candidate orders, each indexable as ``(p, q, P, Q)``.
        d: Non-seasonal differencing order, shared by all candidates.
        D: Seasonal differencing order, shared by all candidates.
        s: Seasonal period, shared by all candidates.

    Returns:
        A DataFrame with columns ``'(p, q, P, Q)'`` and ``'AIC'``, sorted by
        ascending AIC with a fresh integer index. Candidates whose fit raised
        are omitted; the frame is empty (but keeps both columns) when
        ``order_list`` is empty or no candidate could be fitted.
    """
    columns = ['(p, q, P, Q)', 'AIC']
    results = []

    for order in order_list:
        try:
            model = SARIMAX(
                endog,
                order=(order[0], d, order[1]),
                seasonal_order=(order[2], D, order[3], s),
                simple_differencing=False,
            ).fit(disp=False)
        except Exception:
            # Best-effort search: a candidate that cannot be fitted is
            # skipped. Catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit stop a long-running search.
            continue

        results.append([order, model.aic])

    # Passing the column names at construction keeps the empty case valid;
    # assigning .columns afterwards raises on a frame with zero columns.
    result_df = pd.DataFrame(results, columns=columns)
    return result_df.sort_values(by="AIC").reset_index(drop=True)
from itertools import product
# Hold out the last 12 rows for evaluation; fit on the 'Passengers' values
# of everything before them.
train = df.iloc[:-12]['Passengers']
# .copy() gives an independent frame, so a prediction column can be added to
# `test` later without writing into a slice of `df`.
test = df.iloc[-12:].copy()

# Candidate-order search via optimize_SARIMA, left disabled:
# ps = range(0, 4, 1)
# qs = range(0, 4, 1)
# Ps = range(0, 4, 1)
# Qs = range(0, 4, 1)
#
# SARIMA_order_list = list(product(ps, qs, Ps, Qs))
#
#
# d = 1
# D = 1
# s = 12
# SARIMA_result_df = optimize_SARIMA(train, SARIMA_order_list, d, D, s)
# SARIMA_result_df
# SARIMA(2, 1, 1)(1, 1, 2)_12 on the training series.
SARIMA_model = SARIMAX(
    train,
    order=(2, 1, 1),
    seasonal_order=(1, 1, 2, 12),
    simple_differencing=False,
)
SARIMA_model_fit = SARIMA_model.fit(disp=False)

# Diagnostic plots for the fitted model.
SARIMA_model_fit.plot_diagnostics(figsize=(10, 8))
plt.show()
from statsmodels.stats.diagnostic import acorr_ljungbox
# Ljung-Box test on the fitted model's residuals for lags 1 through 10.
tr = acorr_ljungbox(SARIMA_model_fit.resid, lags=np.arange(1, 11))
tr
| | lb_stat | lb_pvalue |
|---|---|---|
| 1 | 0.004785 | 0.944850 |
| 2 | 0.745422 | 0.688864 |
| 3 | 1.021040 | 0.796161 |
| 4 | 1.226086 | 0.873785 |
| 5 | 1.436408 | 0.920290 |
| 6 | 1.711782 | 0.944208 |
| 7 | 2.307234 | 0.940900 |
| 8 | 2.717276 | 0.950829 |
| 9 | 2.733486 | 0.973931 |
| 10 | 4.969176 | 0.893228 |
# Predict the held-out span: it starts right after the last training
# observation and covers one position per row of `test`
# (positions 132..143 for a 132/12 split).
forecast_start = len(train)
forecast_end = forecast_start + len(test) - 1
SARIMA_pred = SARIMA_model_fit.get_prediction(forecast_start, forecast_end).predicted_mean

# Column assignment aligns on index labels.
# NOTE(review): assumes `test` carries the same integer positions as the
# prediction index — confirm if `df` is given a non-default index.
test['SARIMA_pred'] = SARIMA_pred
# Observed series over the full index, with the SARIMA prediction overlaid
# on the held-out rows.
sns.lineplot(data=df, x=df.index, y='Passengers', label='실제 값')
sns.lineplot(data=test, x=test.index, y='SARIMA_pred', label='SARIMA 예측값')

# Label every 12th position with a year from 1949 through 1961.
plt.xticks(np.arange(0, 145, 12), np.arange(1949, 1962, 1))
# Zoom the x-axis to positions 120-143.
plt.xlim(120, 143)