import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset: every column except the last is a feature; the last
# column ('Purchased': 'Yes'/'No') is the target.
dataset = pd.read_csv('_data/00-data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
Machine Learning — Data Preprocessing

Load Library and Data
x
array([['France', 44.0, 72000.0],
['Spain', 27.0, 48000.0],
['Germany', 30.0, 54000.0],
['Spain', 38.0, 61000.0],
['Germany', 40.0, nan],
['France', 35.0, 58000.0],
['Spain', nan, 52000.0],
['France', 48.0, 79000.0],
['Germany', 50.0, 83000.0],
['France', 37.0, 67000.0]], dtype=object)
y
array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
dtype=object)
Taking care of Missing Data — two options:
- delete the rows containing missing values
- replace the missing values (e.g., with the column mean)
# Replace NaN entries in the numeric columns (index 1: age, index 2: salary)
# with the per-column mean.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)
# expected first output row: ['France' 44.0 72000.0]
['Spain' 27.0 48000.0]
['Germany' 30.0 54000.0]
['Spain' 38.0 61000.0]
['Germany' 40.0 63777.77777777778]
['France' 35.0 58000.0]
['Spain' 38.77777777777778 52000.0]
['France' 48.0 79000.0]
['Germany' 50.0 83000.0]
['France' 37.0 67000.0]]
Encoding Categorical Data
- 단순히 categorical 변수를 1, 2, 3으로 변형하면 순서가 고려된 것으로 간주될 수 있다.
- 그래서 [0, 0, 1], [1, 0, 1] 이런 식으로 one hot encoding을 진행한다.
# One-hot encode the country column (index 0); 'passthrough' keeps the
# remaining numeric columns unchanged. fit_transform may return a sparse
# matrix, hence the np.array() conversion.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)
# expected first output row: [1.0 0.0 0.0 44.0 72000.0]
[0.0 0.0 1.0 27.0 48000.0]
[0.0 1.0 0.0 30.0 54000.0]
[0.0 0.0 1.0 38.0 61000.0]
[0.0 1.0 0.0 40.0 63777.77777777778]
[1.0 0.0 0.0 35.0 58000.0]
[0.0 0.0 1.0 38.77777777777778 52000.0]
[1.0 0.0 0.0 48.0 79000.0]
[0.0 1.0 0.0 50.0 83000.0]
[1.0 0.0 0.0 37.0 67000.0]]
# Encode the binary target ('No'/'Yes') as integers (0/1). LabelEncoder is
# fine here because the target is a label, not a feature with fake ordering.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)
print(y)
# expected output: [0 1 0 0 1 1 0 1 0 1]
Split dataset into training set and test set
- feature scaling 이전에 진행되어야함. (test set은 모델이 모르는 정보가 되야하기 때문)
# Split BEFORE feature scaling so the test set stays unseen by the scaler.
# NOTE(review): no random_state is set, so the split is non-reproducible —
# consider passing random_state=... for repeatable results.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Standardize the numeric columns (index 3+); the one-hot dummy columns
# (0-2) are left unscaled. Fit on the training set only, then apply the
# SAME fitted transform to the test set — the test set must not leak into
# the scaler's statistics.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])
print(X_train)
# first output row: [[0.0 1.0 0.0 1.589843519888049 1.6688118414569673]
[0.0 1.0 0.0 0.11473097566202425 -0.017053551664523183]
[0.0 0.0 1.0 -1.8029153318318079 -1.4008274581573077]
[0.0 0.0 1.0 -0.06556055752115642 -1.0500115382013906]
[1.0 0.0 0.0 -0.32780278760578313 0.26554816163329864]
[0.0 0.0 1.0 -0.18029153318318067 -0.260675718300577]
[1.0 0.0 0.0 1.294821011042844 1.31799592150105]
[1.0 0.0 0.0 -0.622825296450988 -0.5237876582675148]]
print(X_test)
# expected output:
# [[0.0 1.0 0.0 -1.3603815685640004 -0.874603578223432]
#  [1.0 0.0 0.0 0.7047759933524341 0.704068061578195]]