import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Select a font family by name so that Korean text in figure labels can render.
plt.rcParams['font.family'] = 'Noto Sans KR'

# Load the widget sales data; the 'widget_sales' column is used throughout.
df = pd.read_csv('_data/widget.csv')

# Plot the raw series against the row index.
sns.lineplot(data=df, x=df.index, y='widget_sales')

# Replace the numeric row-index ticks with month labels at selected positions.
plt.xticks(
    [0, 30, 57, 87, 116, 145, 175, 204, 234, 264, 293, 323, 352, 382, 409, 439, 468, 498],
    ['Jan 2019', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
     'Jan 2020', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
)
plt.show()
이동평균과정 모델링
확률 통계
시계열 분석
- 이동평균 모델: 현재 값이 현재와 과거의 오차 항에 선형적으로 의존한다.
from statsmodels.tsa.stattools import adfuller
# Run the augmented Dickey-Fuller test on the raw sales series.
ADF_result = adfuller(df['widget_sales'])

# Show the first two entries of the result (per the statsmodels docs: the
# test statistic and the p-value).
ADF_result[0], ADF_result[1]
# Recorded output: (-1.5121662069359054, 0.5274845352272601)
- 정상 시계열이 아님. 차분 진행
# First-order difference of the sales series (one element shorter than the input).
diff_df = np.diff(df['widget_sales'], n=1)

# Re-run the augmented Dickey-Fuller test on the differenced series.
ADF_result = adfuller(diff_df)

# Show the first two entries of the result (per the statsmodels docs: the
# test statistic and the p-value).
ADF_result[0], ADF_result[1]
# Recorded output: (-10.576657780341959, 7.076922818587193e-19)
from statsmodels.graphics.tsaplots import plot_acf
# Plot the autocorrelation function of the differenced series up to lag 30.
plot_acf(diff_df, lags=30)
plt.show()
- 지연 2 이후 유의하지 않음.
- MA(2) 진행
from sklearn.model_selection import train_test_split
# Wrap the differenced values in a single-column DataFrame.
diff_df = pd.DataFrame({'widget_sales_diff': diff_df})

# Hold out the last 10% of rows as the test set. shuffle=False is required
# here: train_test_split shuffles rows by default, which would destroy the
# temporal order and leak future observations into the training set.
train, test = train_test_split(diff_df, test_size=0.1, shuffle=False)
- MA(q) 모델은 q 스텝 앞까지만 예측 가능.
- 따라서 q 스텝 단위로 재귀적으로(롤링 방식으로) 예측을 진행해야 함
from statsmodels.tsa.statespace.sarimax import SARIMAX
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window: int, method: str) -> list:
    """Produce rolling forecasts over ``horizon`` steps, ``window`` steps at a time.

    Starting with the first ``train_len`` rows of ``df``, the history is
    extended by ``window`` rows per iteration and the next ``window`` values
    are forecast from it.

    Args:
        df: Data to forecast. The 'last' method reads the first column of the
            row, and the 'mean' method averages over all values of the
            history slice, so a single-column frame is expected.
        train_len: Number of leading rows used as the initial history (>= 1).
        horizon: Total number of values to forecast.
        window: Number of steps forecast per refit (>= 1).
        method: One of
            'mean' - repeat the mean of the history,
            'last' - repeat the last observed value,
            'MA'   - fit SARIMAX with order (0, 0, 2) on the history and take
                     its out-of-sample predictions.

    Returns:
        A list with at most ``horizon`` forecasts, in chronological order.

    Raises:
        ValueError: If ``method`` is not recognised, or if ``train_len`` or
            ``window`` is smaller than 1.
    """
    if method not in ('mean', 'last', 'MA'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'mean', 'last' or 'MA'."
        )
    if train_len < 1:
        raise ValueError('train_len must be at least 1.')
    if window < 1:
        raise ValueError('window must be at least 1.')

    total_len = train_len + horizon
    forecasts = []

    for i in range(train_len, total_len, window):
        history = df[:i]

        if method == 'mean':
            forecasts.extend([np.mean(history.values)] * window)
        elif method == 'last':
            forecasts.extend([df.iloc[i - 1].values[0]] * window)
        else:
            model = SARIMAX(history, order=(0, 0, 2))
            res = model.fit(disp=False)
            # Predict from the start of the sample through `window` steps
            # past its end; only the trailing `window` values are
            # out-of-sample forecasts.
            predictions = res.get_prediction(0, i + window - 1)
            forecasts.extend(predictions.predicted_mean.iloc[-window:])

    # The final iteration may overshoot when horizon is not a multiple of
    # window; trim so the result never exceeds the requested horizon.
    return forecasts[:horizon]
# Collect the forecasts next to the held-out values.
pred_df = test.copy()

TRAIN_LEN = len(train)
HORIZON = len(test)
# Forecast two steps per refit, matching the MA(2) order used by the 'MA' method.
WINDOW = 2

pred_mean = rolling_forecast(diff_df, TRAIN_LEN, HORIZON, WINDOW, 'mean')
pred_last = rolling_forecast(diff_df, TRAIN_LEN, HORIZON, WINDOW, 'last')
pred_MA = rolling_forecast(diff_df, TRAIN_LEN, HORIZON, WINDOW, 'MA')

pred_df['pred_mean'] = pred_mean
pred_df['pred_last'] = pred_last
pred_df['pred_MA'] = pred_MA
# Undo the differencing: add the cumulative sum of the forecast differences
# to the observed value at row 450. The values are taken positionally
# (to_numpy) so that they are not re-aligned on pred_df's index labels.
# NOTE(review): assumes pred_df rows are in chronological order and that
# there are exactly len(df) - 450 of them - confirm against the split.
undiff_forecast = df['widget_sales'].iloc[450] + pred_df['pred_MA'].cumsum().to_numpy()

# Start with an all-NaN column, then fill the forecast rows with a single
# .loc write; chained assignment (df[col].iloc[...] = ...) may write to a
# temporary copy and leave df unchanged.
df['pred_widget_sales'] = np.nan
df.loc[df.index[450:], 'pred_widget_sales'] = undiff_forecast

sns.lineplot(data=df, x=df.index, y='widget_sales', label='실제 값')
sns.lineplot(data=df, x=df.index, y='pred_widget_sales', label='MA(2)')

# Label selected row positions with month names.
plt.xticks(
    [409, 439, 468, 498],
    ['Mar', 'Apr', 'May', 'Jun'],
)
plt.show()