로지스틱 회귀

확률 통계
공개

2025년 7월 1일

\(logO(H|x) = β_0 + β_1x\)

우주 왕복선 문제

import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv')
pred = df.iloc[-1]
df = df[:-1].dropna()
offset = df['Temperature'].mean().round()
df['x'] = df['Temperature'] - offset
df['y'] = df['Damage Incident'].astype(int)

df
Date Temperature Damage Incident x y
0 04/12/1981 66 0 -4.0 0
1 11/12/1981 70 1 0.0 1
2 3/22/82 69 0 -1.0 0
4 01/11/1982 68 0 -2.0 0
5 04/04/1983 67 0 -3.0 0
6 6/18/83 72 0 2.0 0
7 8/30/83 73 0 3.0 0
8 11/28/83 70 0 0.0 0
9 02/03/1984 57 1 -13.0 1
10 04/06/1984 63 1 -7.0 1
11 8/30/84 70 1 0.0 1
12 10/05/1984 78 0 8.0 0
13 11/08/1984 67 0 -3.0 0
14 1/24/85 53 1 -17.0 1
15 04/12/1985 67 0 -3.0 0
16 4/29/85 75 0 5.0 0
17 6/17/85 70 0 0.0 0
18 7/29/85 81 0 11.0 0
19 8/27/85 76 0 6.0 0
20 10/03/1985 79 0 9.0 0
21 10/30/85 75 1 5.0 1
22 11/26/85 76 0 6.0 0
23 01/12/1986 58 1 -12.0 1

전통 로지스틱

import statsmodels.formula.api as smf

formula = 'y ~ x'
results = smf.logit(formula, data=df).fit(disp=False)
results.params
Intercept   -1.208490
x           -0.232163
dtype: float64
from scipy.special import expit
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Noto Sans KR'

inter = results.params['Intercept']
slope = results.params['x']
xs = np.arange(53, 83) - offset
ps = expit(inter + slope * xs)
plt.plot(xs + offset, ps)
plt.scatter(df['x'] + offset, df['y'])

사전 분포

from empiricaldist import Pmf

def make_joint(pmf1, pmf2):
    X, Y = np.meshgrid(pmf1, pmf2)
    return pd.DataFrame(X * Y, columns=pmf1.qs, index=pmf2.qs)

qs = np.linspace(-5, 1, 101)
prior_inter = Pmf.from_seq(qs)
qs = np.linspace(-0.8, 0.1, 101)
prior_slope = Pmf.from_seq(qs)
joint = make_joint(prior_inter, prior_slope)
joint_pmf = Pmf(joint.stack())
joint_pmf
probs
-0.8 -5.00 0.000098
-4.94 0.000098
-4.88 0.000098
-4.82 0.000098
-4.76 0.000098
... ... ...
0.1 0.76 0.000098
0.82 0.000098
0.88 0.000098
0.94 0.000098
1.00 0.000098

10201 rows × 1 columns

가능도

from scipy.stats import binom

grouped = df.groupby('x')['y'].agg(['count', 'sum'])
ns = grouped['count']
ks = grouped['sum']
xs = grouped.index
ps = expit(inter + slope * xs)
likes = binom.pmf(ks, ns, ps)
likes
array([0.93924781, 0.85931657, 0.82884484, 0.60268105, 0.56950687,
       0.24446388, 0.67790595, 0.72637895, 0.18815003, 0.8419509 ,
       0.87045398, 0.15645171, 0.86667894, 0.95545945, 0.96435859,
       0.97729671])
likelihood = joint_pmf.copy()
for slope, inter in joint_pmf.index:
    ps = expit(inter + slope * xs)
    likes = binom.pmf(ks, ns, ps)
    likelihood[slope, inter] = likes.prod()

갱신

posterior_pmf = joint_pmf * likelihood
posterior_pmf.normalize()
joint_posterior = posterior_pmf.unstack()

marginal_inter = Pmf(joint_posterior.sum(axis=0))
marginal_inter.normalize()
marginal_slope = Pmf(joint_posterior.sum(axis=1))
marginal_slope.normalize()

marginal_inter.plot()

marginal_slope.plot()

분포 변환

marginal_probs = marginal_inter.transform(expit)
marginal_lr = marginal_slope.transform(np.exp)

예측 분포

sample = posterior_pmf.choice(101)
temps = np.arange(31, 83)
xs = temps - offset
pred = np.empty((len(sample), len(xs)))
for i, (slope, inter) in enumerate(sample):
    pred[i] = expit(inter + slope * xs)
low, median, high = np.percentile(pred, [5, 50, 95], axis=0)
plt.fill_between(temps, low, high, color='C1', alpha=0.2, label='95% 신뢰구간')
plt.plot(temps, median, color='C1', label='logistic model')
plt.legend()
plt.scatter(df['x'] + offset, df['y'], label='data')

맨 위로