Multiple Linear Regression

machine learning
공개

2025년 2월 26일

preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset: every column except the last is a feature,
# the last column is the regression target.
dataset = pd.read_csv('_data/03.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3; all other columns
# pass through unchanged. scikit-learn's LinearRegression handles the
# dummy-variable trap automatically, so no dummy column is dropped by hand.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

from sklearn.model_selection import train_test_split

# BUG FIX: the original passed train_size=0.2, which trains on only 20%
# of the rows and tests on the remaining 80% (the pasted output below has
# 40 test rows out of 50). Use the conventional 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Multiple linear regression does not require feature scaling.

modeling

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
# LinearRegression chooses the coefficients by least squares itself,
# so no manual backward elimination of features is needed.
# (fit() returns the estimator, so construction and fitting chain.)
regressor = LinearRegression().fit(X_train, y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

predict

# Predict on the held-out test set, then print predictions side by side
# with the true targets (one pair per row, two decimal places).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))
[[ 93771.16 101004.64]
 [ 74466.6   78239.91]
 [ 50652.19  69758.98]
 [105956.49  99937.59]
 [145441.49 129917.04]
 [ 37211.29  64926.08]
 [ 71764.09  71498.49]
 [111249.19 108552.04]
 [ 59299.22  65200.33]
 [120840.79 118474.03]
 [162627.18 149759.96]
 [120230.67 126992.93]
 [166842.52 156991.12]
 [119593.28 108733.99]
 [123155.58 110352.25]
 [170183.11 155752.6 ]
 [198552.13 191050.39]
 [157465.39 132602.65]
 [ 98810.02  97427.84]
 [195490.18 192261.83]
 [142926.02 144259.4 ]
 [106210.25  96778.92]
 [ 89245.08  97483.56]
 [ 45068.42  14681.4 ]
 [ 41277.39  42559.73]
 [115735.66 105733.54]
 [176218.16 182901.99]
 [137986.12 141585.52]
 [ 88264.34  96479.51]
 [ 96578.58  89949.14]
 [ 82567.67  77798.83]
 [196724.5  191792.06]
 [111704.   111313.02]
 [102560.86 103282.38]
 [139310.55 124266.9 ]
 [ 87261.82  81005.76]
 [ 86587.2   96712.8 ]
 [107978.65 122776.86]
 [130333.68 134307.35]
 [188346.5  166187.94]]

evaluate

from sklearn.metrics import r2_score

# R^2 on the test set: the fraction of the target's variance
# that the fitted model explains (1.0 is a perfect fit).
r2_score(y_true=y_test, y_pred=y_pred)
0.913139010635797
맨 위로