로지스틱 회귀

확률 통계

공개

2025년 7월 1일

\(logO(H|x) = β_0 + β_1x\)

우주 왕복선 문제

import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv')
pred = df.iloc[-1]
df = df[:-1].dropna()
offset = df['Temperature'].mean().round()
df['x'] = df['Temperature'] - offset
df['y'] = df['Damage Incident'].astype(int)

df

	Date	Temperature	Damage Incident	x	y
0	04/12/1981	66	0	-4.0	0
1	11/12/1981	70	1	0.0	1
2	3/22/82	69	0	-1.0	0
4	01/11/1982	68	0	-2.0	0
5	04/04/1983	67	0	-3.0	0
6	6/18/83	72	0	2.0	0
7	8/30/83	73	0	3.0	0
8	11/28/83	70	0	0.0	0
9	02/03/1984	57	1	-13.0	1
10	04/06/1984	63	1	-7.0	1
11	8/30/84	70	1	0.0	1
12	10/05/1984	78	0	8.0	0
13	11/08/1984	67	0	-3.0	0
14	1/24/85	53	1	-17.0	1
15	04/12/1985	67	0	-3.0	0
16	4/29/85	75	0	5.0	0
17	6/17/85	70	0	0.0	0
18	7/29/85	81	0	11.0	0
19	8/27/85	76	0	6.0	0
20	10/03/1985	79	0	9.0	0
21	10/30/85	75	1	5.0	1
22	11/26/85	76	0	6.0	0
23	01/12/1986	58	1	-12.0	1

전통 로지스틱

import statsmodels.formula.api as smf

formula = 'y ~ x'
results = smf.logit(formula, data=df).fit(disp=False)
results.params

Intercept   -1.208490
x           -0.232163
dtype: float64

from scipy.special import expit
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Noto Sans KR'

inter = results.params['Intercept']
slope = results.params['x']
xs = np.arange(53, 83) - offset
ps = expit(inter + slope * xs)
plt.plot(xs + offset, ps)
plt.scatter(df['x'] + offset, df['y'])

사전 분포

from empiricaldist import Pmf

def make_joint(pmf1, pmf2):
    X, Y = np.meshgrid(pmf1, pmf2)
    return pd.DataFrame(X * Y, columns=pmf1.qs, index=pmf2.qs)

qs = np.linspace(-5, 1, 101)
prior_inter = Pmf.from_seq(qs)
qs = np.linspace(-0.8, 0.1, 101)
prior_slope = Pmf.from_seq(qs)
joint = make_joint(prior_inter, prior_slope)
joint_pmf = Pmf(joint.stack())
joint_pmf

		probs
-0.8	-5.00	0.000098
	-4.94	0.000098
	-4.88	0.000098
	-4.82	0.000098
	-4.76	0.000098
...	...	...
0.1	0.76	0.000098
	0.82	0.000098
	0.88	0.000098
	0.94	0.000098
	1.00	0.000098

10201 rows × 1 columns

가능도

from scipy.stats import binom

grouped = df.groupby('x')['y'].agg(['count', 'sum'])
ns = grouped['count']
ks = grouped['sum']
xs = grouped.index
ps = expit(inter + slope * xs)
likes = binom.pmf(ks, ns, ps)
likes

array([0.93924781, 0.85931657, 0.82884484, 0.60268105, 0.56950687,
       0.24446388, 0.67790595, 0.72637895, 0.18815003, 0.8419509 ,
       0.87045398, 0.15645171, 0.86667894, 0.95545945, 0.96435859,
       0.97729671])

likelihood = joint_pmf.copy()
for slope, inter in joint_pmf.index:
    ps = expit(inter + slope * xs)
    likes = binom.pmf(ks, ns, ps)
    likelihood[slope, inter] = likes.prod()

갱신

posterior_pmf = joint_pmf * likelihood
posterior_pmf.normalize()
joint_posterior = posterior_pmf.unstack()

marginal_inter = Pmf(joint_posterior.sum(axis=0))
marginal_inter.normalize()
marginal_slope = Pmf(joint_posterior.sum(axis=1))
marginal_slope.normalize()

marginal_inter.plot()

marginal_slope.plot()

분포 변환

marginal_probs = marginal_inter.transform(expit)
marginal_lr = marginal_slope.transform(np.exp)

예측 분포

sample = posterior_pmf.choice(101)
temps = np.arange(31, 83)
xs = temps - offset
pred = np.empty((len(sample), len(xs)))
for i, (slope, inter) in enumerate(sample):
    pred[i] = expit(inter + slope * xs)
low, median, high = np.percentile(pred, [5, 50, 95], axis=0)
plt.fill_between(temps, low, high, color='C1', alpha=0.2, label='95% 신뢰구간')
plt.plot(temps, median, color='C1', label='logistic model')
plt.legend()
plt.scatter(df['x'] + offset, df['y'], label='data')

맨 위로