import pandas as pd
import warnings
# Silence every warning category for the remainder of the session so the
# notebook output stays uncluttered.
warnings.filterwarnings(action='ignore')

# Read the vote data set from the project's data directory.
dataset = pd.read_csv(
    filepath_or_buffer="./_data/class/FFvote.csv",
    encoding='utf-8',
)

# Preview the first rows; as a bare expression this is only displayed when it
# is the last statement of a notebook cell.
dataset.head()
0 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
1.0 |
0.666667 |
0.666667 |
0.25 |
0.25 |
0.75 |
1 |
2 |
1 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0.5 |
0.666667 |
0.666667 |
0.25 |
0.75 |
0.50 |
0 |
3 |
2 |
2 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0.0 |
0.333333 |
1.000000 |
0.00 |
0.50 |
0.45 |
1 |
4 |
3 |
3 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0.5 |
0.000000 |
0.666667 |
1.00 |
0.75 |
0.40 |
1 |
1 |
4 |
4 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0.0 |
0.333333 |
1.000000 |
0.75 |
0.50 |
0.35 |
1 |
1 |
# Target vector: the 'vote' column as a NumPy array.
y = dataset['vote'].values

# Feature matrix: every row, and all columns from 'gender_female' through
# 'score_intention' (label-based slicing with .loc includes both end points).
X = dataset.loc[:, slice('gender_female', 'score_intention')].values
어떻게 하면 scaling을 간단하게 처리할 수 있을까 고민하던 중 sklearn pipeline 문서의 Safety 부분을 참고해서 작성해봤습니다.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Bundle scaling and the classifier into a single estimator. cross_val_score
# refits a fresh copy of the whole pipeline on each training split, so the
# StandardScaler statistics are never computed from the held-out fold.
pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=42),
)

# Report the mean accuracy for several fold counts.
# Here `k` is the number of cross-validation folds, not the number of
# neighbours (which stays fixed at 42 above).
for k in (3, 4, 5):
    # shuffle=False splits the rows into k consecutive chunks in file order.
    # NOTE(review): if the CSV rows are ordered by class, these folds may be
    # unbalanced -- confirm, and consider StratifiedKFold if so.
    kf = KFold(n_splits=k, shuffle=False)
    accuracies = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy')
    print(f"Accuracy for K={k}: {round(accuracies.mean(), 2)}")
Accuracy for K=3: 0.71
Accuracy for K=4: 0.71
Accuracy for K=5: 0.72
맨 위로