import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
# Load the scikit-learn wine dataset and wrap the feature matrix in a DataFrame.
wine_load = load_wine()
wine = pd.DataFrame(wine_load.data, columns=wine_load.feature_names)

# Attach the target and map its integer codes to readable class labels.
wine['class'] = wine_load.target
wine['class'] = wine['class'].map({0: 'class_0', 1: 'class_1', 2: 'class_2'})

# Box plot of color_intensity; whis=1.5 puts the whiskers at 1.5 * IQR,
# so points drawn beyond them are the IQR-rule outlier candidates.
plt.boxplot(wine['color_intensity'], whis=1.5)
plt.title('Boxplot of color_intensity')
plt.show()
데이터 전처리
데이터 분석
데이터 전처리의 의미
- 데이터 클리닝
- 데이터 통합
- 데이터 변환
- 데이터 축소
- 불균형 데이터 처리
- 데이터 분할
이상치 확인 및 정제
이상치 확인
import numpy as np
def outliers_iqr(dt, col):
    """Return the rows of ``dt`` whose ``col`` value is an IQR-rule outlier.

    A value is treated as an outlier when it lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of ``dt[col]`` and ``IQR = Q3 - Q1``.

    Args:
        dt: DataFrame to inspect.
        col: Label of the numeric column to test.

    Returns:
        The subset of ``dt`` (all columns) selected by the outlier mask.
    """
    q1, q3 = np.percentile(dt[col], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return dt[(dt[col] < lower_bound) | (dt[col] > upper_bound)]
# Collect the rows whose color_intensity falls outside the 1.5 * IQR fences.
outliers = outliers_iqr(wine, 'color_intensity')
outliers
| alcohol | malic_acid | ash | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue | od280/od315_of_diluted_wines | proline | class |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
151 | 12.79 | 2.67 | 2.48 | 22.0 | 112.0 | 1.48 | 1.36 | 0.24 | 1.26 | 10.80 | 0.48 | 1.47 | 480.0 | class_2 |
158 | 14.34 | 1.68 | 2.70 | 25.0 | 98.0 | 2.80 | 1.31 | 0.53 | 2.70 | 13.00 | 0.57 | 1.96 | 660.0 | class_2 |
159 | 13.48 | 1.67 | 2.64 | 22.5 | 89.0 | 2.60 | 1.10 | 0.52 | 2.29 | 11.75 | 0.57 | 1.78 | 620.0 | class_2 |
166 | 13.45 | 3.70 | 2.60 | 23.0 | 111.0 | 1.70 | 0.92 | 0.43 | 1.46 | 10.68 | 0.85 | 1.56 | 695.0 | class_2 |
이상치 정제
- 이상치 제거
# Remove the outlier rows by index label; `wine` itself is left unchanged.
drop_outliers = wine.drop(index=outliers.index)

# Compare shapes before and after dropping to confirm how many rows went away.
print("Original:", wine.shape)
print("Drop outliers:", drop_outliers.shape)
Original: (178, 14)
Drop outliers: (174, 14)
- 이상치 대체
이상치를 NULL로 만든 후, 결측치와 함께 대체
# Blank out the outlier cells so they can be imputed like ordinary missing
# values. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
wine.loc[outliers.index, 'color_intensity'] = np.nan

# Fill with the column mean (Series.mean skips NaN, so the blanked outliers do
# not contribute). Assign the result back instead of calling
# `fillna(..., inplace=True)` on `wine['color_intensity']`: that chained
# in-place call operates on an intermediate object and raises a FutureWarning.
wine['color_intensity'] = wine['color_intensity'].fillna(
    wine['color_intensity'].mean()
)

# Show the replaced values for the former outlier rows.
wine.loc[outliers.index, 'color_intensity']
/tmp/ipykernel_11221/3568685677.py:3: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
151 4.908678
158 4.908678
159 4.908678
166 4.908678
Name: color_intensity, dtype: float64