from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt

iris = load_iris()
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df = pd.DataFrame(iris.data, columns=columns)
df['target'] = iris.target

markers = ['^', 's', 'o']

for i, marker in enumerate(markers):
    x_axis_data = df[df['target'] == i]['sepal_length']
    y_axis_data = df[df['target'] == i]['sepal_width']
    plt.scatter(x_axis_data, y_axis_data, marker=marker, label=iris.target_names[i])

plt.legend()
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.show()
Dimensionality Reduction
Machine Learning
PCA
- PCA is affected by the scaling of the features (quick check in the sketch below).
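A quick way to see this; a minimal sketch that assumes the iris df built above and simply compares the explained variance ratios with and without standardization:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# fit PCA on the raw features and on the standardized features
raw = df.iloc[:, :-1]
scaled = StandardScaler().fit_transform(raw)

print(PCA(n_components=2).fit(raw).explained_variance_ratio_)
print(PCA(n_components=2).fit(scaled).explained_variance_ratio_)
# the ratios differ: without scaling, the components chase whichever features have the largest raw variance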
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaled_df = StandardScaler().fit_transform(df.iloc[:, :-1])

pca = PCA(n_components=2)
df = pca.fit_transform(scaled_df)
pca_columns = ['pca_component_1', 'pca_component_2']
df = pd.DataFrame(df, columns=pca_columns)
df['target'] = iris.target
markers = ['^', 's', 'o']

for i, marker in enumerate(markers):
    x_axis_data = df[df['target'] == i]['pca_component_1']
    y_axis_data = df[df['target'] == i]['pca_component_2']
    plt.scatter(x_axis_data, y_axis_data, marker=marker, label=iris.target_names[i])

plt.legend()
plt.xlabel('pca_component_1')
plt.ylabel('pca_component_2')
plt.show()
Credit Card Customer Data
df = pd.read_excel('_data/creadit_card.xls', header=1, sheet_name='Data').iloc[:, 1:]
df.rename(columns={'PAY_0': 'PAY_1', 'default payment next month': 'default'}, inplace=True)
target = df['default']
features = df.drop('default', axis=1)

import seaborn as sns

corr = features.corr()
sns.heatmap(corr, annot=True, fmt='.1g')
- BILL_AMT1~6 and PAY_1~6 are each highly correlated among themselves.
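Those two blocks can also be read off the correlation matrix directly instead of the heatmap; a small sketch, where cols_bill / cols_pay are just the column-name lists as they exist after the rename above:

cols_bill = ['BILL_AMT' + str(i) for i in range(1, 7)]
cols_pay = ['PAY_' + str(i) for i in range(1, 7)]

print(corr.loc[cols_bill, cols_bill].round(2))   # BILL_AMT1~6 block
print(corr.loc[cols_pay, cols_pay].round(2))     # PAY_1~6 block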
cols_bill = ['BILL_AMT' + str(i) for i in range(1, 7)]
scaler = StandardScaler()
df_cols_scaled = scaler.fit_transform(features[cols_bill])
pca = PCA(n_components=2)
pca.fit(df_cols_scaled)
pca.explained_variance_ratio_

array([0.90555253, 0.0509867 ])
- When running PCA, do all the columns have to go in?
- It seems they all should (one way to check is sketched below).
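One way to look at it: fit PCA over all the standardized features and inspect the cumulative explained variance, then choose n_components from that instead of hand-picking a column subset up front. A sketch reusing the features frame from above:

import numpy as np

all_scaled = StandardScaler().fit_transform(features)
pca_all = PCA()                      # keep every component
pca_all.fit(all_scaled)
print(np.cumsum(pca_all.explained_variance_ratio_))   # variance retained as components are added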
LDA
- Finds the axes that maximize separation between classes.
- Unlike PCA, it is supervised learning (see the sketch below).
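The supervised/unsupervised difference shows up directly in the API. A minimal sketch, where X and y are just the iris features and labels:

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_iris(return_X_y=True)

pca_2d = PCA(n_components=2).fit_transform(X)                             # no labels needed
lda_2d = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)   # labels required
# LDA allows at most n_classes - 1 components (3 classes here, so 2 at most)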
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

iris = load_iris()
iris_scaled = StandardScaler().fit_transform(iris.data)

lda = LinearDiscriminantAnalysis(n_components=2)
iris_lda = lda.fit_transform(iris_scaled, iris.target)

lda_columns = ['lda_components_1', 'lda_components_2']
df = pd.DataFrame(iris_lda, columns=lda_columns)
df['target'] = iris.target
markers = ['^', 's', 'o']

for i, marker in enumerate(markers):
    x_axis_data = df[df['target'] == i]['lda_components_1']
    y_axis_data = df[df['target'] == i]['lda_components_2']
    plt.scatter(x_axis_data, y_axis_data, marker=marker, label=iris.target_names[i])

plt.legend()
plt.xlabel('lda_components_1')
plt.ylabel('lda_components_2')
plt.show()