Homework - 2

data mining

공개

2025년 4월 10일

import pandas as pd
import warnings

# Suppress every warning from here on so the cell outputs stay uncluttered.
warnings.filterwarnings(action="ignore")

# Read the FFvote CSV into a DataFrame, then preview its first rows.
csv_path = "./_data/class/FFvote.csv"
dataset = pd.read_csv(csv_path, encoding="utf-8")
dataset.head()

	Unnamed: 0	gender_female	gender_male	region_Honam	region_Sudo	region_Youngnam	region_others	edu	income	age	score_gov	score_progress	score_intention	vote	parties
0	0	0	1	0	0	1	0	1.0	0.666667	0.666667	0.25	0.25	0.75	1	2
1	1	0	1	0	0	0	1	0.5	0.666667	0.666667	0.25	0.75	0.50	0	3
2	2	0	1	1	0	0	0	0.0	0.333333	1.000000	0.00	0.50	0.45	1	4
3	3	1	0	0	1	0	0	0.5	0.000000	0.666667	1.00	0.75	0.40	1	1
4	4	0	1	0	1	0	0	0.0	0.333333	1.000000	0.75	0.50	0.35	1	1

# Feature matrix: every column from 'gender_female' through 'score_intention'
# (a label-based slice, so both endpoints are included).
feature_frame = dataset.loc[:, 'gender_female':'score_intention']
X = feature_frame.to_numpy()

# Target vector: the 'vote' column.
y = dataset['vote'].to_numpy()

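어떻게 하면 scaling을 간단하게 처리할 수 있을까 고민하던 중, scikit-learn Pipeline 문서의 "Safety" 부분을 참고해서 작성해 봤습니다. 스케일러와 분류기를 하나의 pipeline으로 묶으면 교차 검증의 각 fold에서 학습 데이터로만 스케일러가 fit되므로, 검증 데이터의 통계가 학습 과정에 새어 들어가는 것(data leakage)을 막을 수 있습니다.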

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize the features, then classify with k-nearest neighbours using
# 42 neighbours. Bundling both steps into one estimator lets cross_val_score
# refit the scaler on the training portion of every split.
scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=42)
pipeline = make_pipeline(scaler, knn)

# Report mean accuracy for 3-, 4- and 5-fold cross-validation. Shuffling is
# off, so each fold is a contiguous slice of the rows in file order.
for n_splits in (3, 4, 5):
    folds = KFold(n_splits=n_splits, shuffle=False)
    fold_scores = cross_val_score(pipeline, X, y, cv=folds, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print(f"Accuracy for K={n_splits}: {round(mean_accuracy, 2)}")

Accuracy for K=3: 0.71
Accuracy for K=4: 0.71
Accuracy for K=5: 0.72

맨 위로