
\(logO(H|x) = β_0 + β_1x\)
우주 왕복선 문제
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv')
pred = df.iloc[-1]
df = df[:-1].dropna()
offset = df['Temperature'].mean().round()
df['x'] = df['Temperature'] - offset
df['y'] = df['Damage Incident'].astype(int)
df
0 |
04/12/1981 |
66 |
0 |
-4.0 |
0 |
1 |
11/12/1981 |
70 |
1 |
0.0 |
1 |
2 |
3/22/82 |
69 |
0 |
-1.0 |
0 |
4 |
01/11/1982 |
68 |
0 |
-2.0 |
0 |
5 |
04/04/1983 |
67 |
0 |
-3.0 |
0 |
6 |
6/18/83 |
72 |
0 |
2.0 |
0 |
7 |
8/30/83 |
73 |
0 |
3.0 |
0 |
8 |
11/28/83 |
70 |
0 |
0.0 |
0 |
9 |
02/03/1984 |
57 |
1 |
-13.0 |
1 |
10 |
04/06/1984 |
63 |
1 |
-7.0 |
1 |
11 |
8/30/84 |
70 |
1 |
0.0 |
1 |
12 |
10/05/1984 |
78 |
0 |
8.0 |
0 |
13 |
11/08/1984 |
67 |
0 |
-3.0 |
0 |
14 |
1/24/85 |
53 |
1 |
-17.0 |
1 |
15 |
04/12/1985 |
67 |
0 |
-3.0 |
0 |
16 |
4/29/85 |
75 |
0 |
5.0 |
0 |
17 |
6/17/85 |
70 |
0 |
0.0 |
0 |
18 |
7/29/85 |
81 |
0 |
11.0 |
0 |
19 |
8/27/85 |
76 |
0 |
6.0 |
0 |
20 |
10/03/1985 |
79 |
0 |
9.0 |
0 |
21 |
10/30/85 |
75 |
1 |
5.0 |
1 |
22 |
11/26/85 |
76 |
0 |
6.0 |
0 |
23 |
01/12/1986 |
58 |
1 |
-12.0 |
1 |
전통 로지스틱
import statsmodels.formula.api as smf
formula = 'y ~ x'
results = smf.logit(formula, data=df).fit(disp=False)
results.params
Intercept -1.208490
x -0.232163
dtype: float64
from scipy.special import expit
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Noto Sans KR'
inter = results.params['Intercept']
slope = results.params['x']
xs = np.arange(53, 83) - offset
ps = expit(inter + slope * xs)
plt.plot(xs + offset, ps)
plt.scatter(df['x'] + offset, df['y'])
사전 분포
from empiricaldist import Pmf
def make_joint(pmf1, pmf2):
X, Y = np.meshgrid(pmf1, pmf2)
return pd.DataFrame(X * Y, columns=pmf1.qs, index=pmf2.qs)
qs = np.linspace(-5, 1, 101)
prior_inter = Pmf.from_seq(qs)
qs = np.linspace(-0.8, 0.1, 101)
prior_slope = Pmf.from_seq(qs)
joint = make_joint(prior_inter, prior_slope)
joint_pmf = Pmf(joint.stack())
joint_pmf
-0.8 |
-5.00 |
0.000098 |
-4.94 |
0.000098 |
-4.88 |
0.000098 |
-4.82 |
0.000098 |
-4.76 |
0.000098 |
... |
... |
... |
0.1 |
0.76 |
0.000098 |
0.82 |
0.000098 |
0.88 |
0.000098 |
0.94 |
0.000098 |
1.00 |
0.000098 |
10201 rows × 1 columns
가능도
from scipy.stats import binom
grouped = df.groupby('x')['y'].agg(['count', 'sum'])
ns = grouped['count']
ks = grouped['sum']
xs = grouped.index
ps = expit(inter + slope * xs)
likes = binom.pmf(ks, ns, ps)
likes
array([0.93924781, 0.85931657, 0.82884484, 0.60268105, 0.56950687,
0.24446388, 0.67790595, 0.72637895, 0.18815003, 0.8419509 ,
0.87045398, 0.15645171, 0.86667894, 0.95545945, 0.96435859,
0.97729671])
likelihood = joint_pmf.copy()
for slope, inter in joint_pmf.index:
ps = expit(inter + slope * xs)
likes = binom.pmf(ks, ns, ps)
likelihood[slope, inter] = likes.prod()
갱신
posterior_pmf = joint_pmf * likelihood
posterior_pmf.normalize()
joint_posterior = posterior_pmf.unstack()
marginal_inter = Pmf(joint_posterior.sum(axis=0))
marginal_inter.normalize()
marginal_slope = Pmf(joint_posterior.sum(axis=1))
marginal_slope.normalize()
marginal_inter.plot()
분포 변환
marginal_probs = marginal_inter.transform(expit)
marginal_lr = marginal_slope.transform(np.exp)
예측 분포
sample = posterior_pmf.choice(101)
temps = np.arange(31, 83)
xs = temps - offset
pred = np.empty((len(sample), len(xs)))
for i, (slope, inter) in enumerate(sample):
pred[i] = expit(inter + slope * xs)
low, median, high = np.percentile(pred, [5, 50, 95], axis=0)
plt.fill_between(temps, low, high, color='C1', alpha=0.2, label='95% 신뢰구간')
plt.plot(temps, median, color='C1', label='logistic model')
plt.legend()
plt.scatter(df['x'] + offset, df['y'], label='data')
맨 위로