모델링, 평가 템플릿

데이터 분석

공개

2025년 9월 21일

모델 정의

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Column-wise preprocessing: each feature group gets its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),  # consider RobustScaler when there are many outliers
    ('cat', OneHotEncoder(drop='first'), cat_features),
    # Ordinal features. Encoding them by hand with DataFrame.map() would
    # probably be better than going through OrdinalEncoder.
    ('ord', OrdinalEncoder(), ord_features),
])

# The model step is named 'classifier' so that it matches the
# 'classifier__<param>' keys and the named_steps['classifier'] lookup used by
# the tuning and feature-importance snippets in this file. With a different
# step name, set_params / GridSearchCV reject those keys.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', YourModel(random_state=42)),  # YourModel is a placeholder for the estimator class
])

category 형을 있는 그대로 처리하고 싶다면, pipeline보다는 LGBM, CatBoost, XGBoost 등의 모델을 바로 사용하는게 더 나음.
- sklearn은 데이터들을 numpy array로 변환하기 때문에 category 형을 유지하지 못함

Grid Search & Bayesian Optimization

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Every key must be prefixed with the pipeline step name followed by '__'.
params = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[10, 50, 100],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# scoring options
#   classification: accuracy, f1, roc_auc, f1_macro, roc_auc_ovr, roc_auc_ovo
#   regression:     neg_root_mean_squared_error, r2, neg_mean_absolute_error
grid_search = GridSearchCV(pipeline, params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Keep the best pipeline found by the search and predict on the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# ============ 분류 scoring ============

# Positive-class probabilities (binary case). For multiclass, drop the [:, 1] selection.
class_probabilities = best_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

test_acc = accuracy_score(y_test, y_pred)

# average='binary' is the two-class setting; for multiclass choose 'macro', 'micro' or 'weighted'.
prf_result = precision_recall_fscore_support(y_test, y_pred, average='binary')
test_precision, test_recall, test_f1 = prf_result[:3]  # 4th element (support) is not used

# For multiclass, pass multi_class='ovr' or 'ovo'.
test_auc = roc_auc_score(y_test, y_pred_proba)

classification_rows = (
    ("test_accuracy", test_acc),
    ("test_precision", test_precision),
    ("test_recall", test_recall),
    ("test_f1_score", test_f1),
    ("test_auc", test_auc),
)
for metric_label, metric_value in classification_rows:
    print(f"{metric_label}: {metric_value:.4f}")

# ============ 회귀 scoring ============

# RMSE is taken as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# it raises TypeError on current versions; this form works on every version.
# (scikit-learn >= 1.4 also provides sklearn.metrics.root_mean_squared_error.)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"test_rmse: {rmse:.4f}")
print(f"test_mae: {mae:.4f}")
print(f"test_r2: {r2:.4f}")

👇 시험에서 사용 불가. 하지만 kaggle이나 dacon에서 주로 사용됨

import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

def objective(trial):
    """Optuna objective: mean 5-fold CV score of ``pipeline`` for one trial.

    Samples one hyperparameter set from ``trial``, applies it in place to the
    module-level ``pipeline`` and scores it with cross-validation on
    ``X_train`` / ``y_train``.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        Mean cross-validation score over the 5 folds.
    """
    # Adjust the parameters to the model in use.
    # `step` is passed by keyword: passing it positionally to suggest_int is
    # deprecated in Optuna 3.x and scheduled for removal.
    params = {
        'classifier__n_estimators': trial.suggest_int("n_estimators", 100, 500, step=100),
        'classifier__max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2']),
        'classifier__max_depth': trial.suggest_int("max_depth", 10, 110, step=20),
        'classifier__min_samples_split': trial.suggest_int("min_samples_split", 2, 10, step=2),
        'classifier__min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 4, step=1),
    }
    pipeline.set_params(**params)

    # scoring options
    #   classification: accuracy, f1, roc_auc, f1_macro, roc_auc_ovr, roc_auc_ovo
    #   regression:     neg_root_mean_squared_error, r2, neg_mean_absolute_error
    cv_score = cross_val_score(pipeline,
                               X_train,
                               y_train,
                               cv=5,
                               scoring='accuracy')
    return cv_score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# study.best_params is keyed by the names given to trial.suggest_*
# ("n_estimators", "max_depth", ...), not by pipeline parameter names.
# The 'classifier__' step prefix has to be added back before set_params,
# otherwise Pipeline.set_params raises ValueError (invalid parameter).
best_params = {
    f"classifier__{name}": value
    for name, value in study.best_params.items()
}
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# ============ 분류 scoring ============

# Positive-class probabilities (binary case).
proba_matrix = pipeline.predict_proba(X_test)
y_pred_proba = proba_matrix[:, 1]

test_acc = accuracy_score(y_test, y_pred)

# For multiclass, choose average='macro', 'micro' or 'weighted'.
prf_result = precision_recall_fscore_support(y_test, y_pred, average='binary')
test_precision, test_recall, test_f1 = prf_result[:3]  # 4th element (support) is not used

# For multiclass, pass multi_class='ovr' or 'ovo' (the keyword is multi_class).
test_auc = roc_auc_score(y_test, y_pred_proba)

report_rows = (
    ("test_accuracy:", test_acc),
    ("test_precision:", test_precision),
    ("test_recall:", test_recall),
    ("test_f1_score:", test_f1),
    ("test_auc:", test_auc),
)
for row_label, row_value in report_rows:
    print(row_label, row_value)

# ============ 회귀 scoring ============

# RMSE is taken as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# it raises TypeError on current versions; this form works on every version.
# (scikit-learn >= 1.4 also provides sklearn.metrics.root_mean_squared_error.)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"test_rmse: {rmse:.4f}")
print(f"test_mae: {mae:.4f}")
print(f"test_r2: {r2:.4f}")

👇 시험에서 사용 가능

from hyperopt import hp, STATUS_OK, fmin, tpe, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Hyperopt search space, keyed by pipeline parameter name.
# The string passed to each hp.* call is the label under which fmin reports
# the best value.
search_space = dict(
    classifier__n_estimators=hp.quniform('n_estimators', 100, 500, 100),
    classifier__max_features=hp.choice('max_features', ['sqrt', 'log2']),
    classifier__max_depth=hp.quniform('max_depth', 10, 110, 20),
    classifier__min_samples_split=hp.quniform('min_samples_split', 2, 10, 2),
    classifier__min_samples_leaf=hp.quniform('min_samples_leaf', 1, 4, 1),
)

def objective_func(params):
    """Hyperopt objective: negated mean 5-fold CV score of ``pipeline``.

    Args:
        params: one sample drawn from ``search_space``, keyed by pipeline
            parameter name.

    Returns:
        dict with ``'loss'`` (the negative mean CV score) and
        ``'status'`` set to ``STATUS_OK``.
    """
    # hyperopt hands numeric draws back as floats, so the integer-valued
    # hyperparameters are cast to int before being applied to the pipeline.
    integer_keys = (
        'classifier__n_estimators',
        'classifier__max_depth',
        'classifier__min_samples_split',
        'classifier__min_samples_leaf',
    )
    typed_params = {key: int(params[key]) for key in integer_keys}
    typed_params['classifier__max_features'] = params['classifier__max_features']
    pipeline.set_params(**typed_params)

    # scoring options
    #   classification: accuracy, f1, roc_auc, f1_macro, roc_auc_ovr, roc_auc_ovo
    #   regression:     neg_root_mean_squared_error, r2, neg_mean_absolute_error
    fold_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)

    # hyperopt minimizes the loss, so the score is negated in order to maximize it.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Run the optimization.
trials = Trials()
best = fmin(objective_func, search_space, algo=tpe.suggest, max_evals=100, trials=trials)

# Apply the best parameters (with type conversion). `best` is keyed by the hp
# labels; the numeric entries are cast to int, and the max_features entry is
# used as an index into the option list given to hp.choice.
max_features_options = ['sqrt', 'log2']
integer_labels = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
best_params = {f'classifier__{label}': int(best[label]) for label in integer_labels}
best_params['classifier__max_features'] = max_features_options[int(best['max_features'])]

pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# ============ 분류 scoring ============

# Positive-class probabilities (binary case).
proba_matrix = pipeline.predict_proba(X_test)
y_pred_proba = proba_matrix[:, 1]

test_acc = accuracy_score(y_test, y_pred)

# For multiclass, choose average='macro', 'micro' or 'weighted'.
prf_result = precision_recall_fscore_support(y_test, y_pred, average='binary')
test_precision, test_recall, test_f1 = prf_result[:3]  # 4th element (support) is not used

# For multiclass, pass multi_class='ovr' or 'ovo' (the keyword is multi_class).
test_auc = roc_auc_score(y_test, y_pred_proba)

report_rows = (
    ("test_accuracy:", test_acc),
    ("test_precision:", test_precision),
    ("test_recall:", test_recall),
    ("test_f1_score:", test_f1),
    ("test_auc:", test_auc),
)
for row_label, row_value in report_rows:
    print(row_label, row_value)

# ============ 회귀 scoring ============

# RMSE is taken as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# it raises TypeError on current versions; this form works on every version.
# (scikit-learn >= 1.4 also provides sklearn.metrics.root_mean_squared_error.)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"test_rmse: {rmse:.4f}")
print(f"test_mae: {mae:.4f}")
print(f"test_r2: {r2:.4f}")

모델 평가 visualization

ROC AUC

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Compute the ROC curve.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # positive-class probability
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
# Area under the ROC curve, shown in the legend below. It has to be computed
# here: the legend label reads `roc_auc`, and `auc` is imported for this purpose.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

Feature Importance

import matplotlib.pyplot as plt
import seaborn as sns

# Importances reported by the fitted model step of the pipeline.
importances = pipeline.named_steps['classifier'].feature_importances_

# The model is trained on the ColumnTransformer output (scaled / one-hot /
# ordinal columns, in transformer order), not on the raw X columns, so the
# labels are taken from the fitted preprocessor instead of X.columns.
# get_feature_names_out requires scikit-learn >= 1.0.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
ftr_importance = pd.Series(importances, index=feature_names)

ftr_top = ftr_importance.sort_values(ascending=False)[:20]  # keep as many features as needed
sns.barplot(x=ftr_top, y=ftr_top.index)
plt.show()

Drop Column importance

from sklearn.base import clone

# Baseline score of the full model. It does not depend on the dropped column,
# so it is computed once instead of on every iteration.
baseline_score = best_model.score(X_test, y_test)

for col in X_train.columns:
    X_train_dropped = X_train.drop(columns=[col])
    X_test_dropped = X_test.drop(columns=[col])

    # To save time the tuned hyperparameters are reused as they are.
    # For a rigorous comparison, re-tune at every iteration.
    model_dropped = clone(best_model)

    # When the model is a Pipeline whose 'preprocessor' step is a
    # ColumnTransformer, its column selections still name `col`, and fitting
    # on data without that column fails. Remove `col` from every selection.
    # NOTE(review): assumes each selection is a list of column names, as in the
    # pipeline defined in this file -- confirm for other pipelines.
    preprocessor_step = getattr(model_dropped, 'named_steps', {}).get('preprocessor')
    if hasattr(preprocessor_step, 'transformers'):
        preprocessor_step.set_params(transformers=[
            (name, transformer, [c for c in columns if c != col])
            for name, transformer, columns in preprocessor_step.transformers
        ])

    model_dropped.fit(X_train_dropped, y_train)

    score = model_dropped.score(X_test_dropped, y_test)
    print(f"{col}'s difference:", baseline_score - score)  # or use another evaluation metric

Permutation importance

import eli5
from eli5.sklearn import PermutationImportance

# Fit permutation importance for the tuned model on the test split,
# then display the per-feature weights labelled with the column names.
perm_estimator = PermutationImportance(best_model)
perm = perm_estimator.fit(X_test, y_test)
test_feature_names = X_test.columns.tolist()
eli5.show_weights(perm, feature_names=test_feature_names)

음수 값은 해당 feature를 섞었을 때 오히려 성능이 좋아졌다는 의미이므로, 그 feature는 제거해도 된다는 뜻
단점: 상관관계가 높은 feature에 대해서 비현실적인 데이터 조합이 생성될 가능성이 높다.
- (ex. shuffle을 통해 키 180cm, 몸무게 30kg의 데이터가 조합되는 경우)

ensemble

stacking:
- 여러 모델을 학습시킨 후, 각 모델의 예측 결과를 입력으로 하는 메타 모델을 학습시킨다.
- 메타 모델은 다른 모델들의 예측 결과를 종합하여 최종 예측을 수행한다.
voting, averaging:
- 여러 모델을 학습시킨 후, 각 모델의 예측 결과를 투표(voting)하거나 평균(averaging)하여 최종 예측을 수행한다.
- 분류 문제에서는 다수결 투표(hard voting) 또는 확률 평균(soft voting)을 사용하고, 회귀 문제에서는 단순 평균 또는 가중 평균을 사용한다.
bagging
- vs cross validation:
  - cross validation은 이미 생성된 모델을 검증하기 위한 방법. 모델 구축 방법은 아님
  - bagging은 분산을 줄이기 위해 사용함
boosting: sequentially 학습
- 이전 모델의 오차를 보완하는 방식으로 학습한다.

from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# The base estimators are imported above; without those imports this snippet
# fails with NameError.

# Classification: the base models' predictions feed a logistic-regression meta model.
stacking_lf = StackingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier())
], final_estimator=LogisticRegression())

# Regression: the base models' predictions feed a linear-regression meta model.
stacking_rf = StackingRegressor(estimators=[
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor()),
    ('rf', RandomForestRegressor())
], final_estimator=LinearRegression())

# NOTE: this template fits both variants on the same y_train; in practice keep
# only the one that matches the task (classification or regression).
stacking_lf.fit(X_train, y_train)
stacking_rf.fit(X_train, y_train)
stacking_lf.predict(X_test)
stacking_rf.predict(X_test)

from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# The base estimators are imported above; without those imports this snippet
# fails with NameError.

# Classification: 'soft' averages predicted probabilities, 'hard' is a majority vote.
voting_lf = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier())
], voting='soft')  # or 'hard'

# Regression: weighted average of the base predictions (the random forest counts double).
voting_rf = VotingRegressor(estimators=[
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor()),
    ('rf', RandomForestRegressor())
], weights=[1, 1, 2])

# NOTE: this template fits both variants on the same y_train; in practice keep
# only the one that matches the task (classification or regression).
voting_lf.fit(X_train, y_train)
voting_rf.fit(X_train, y_train)
voting_lf.predict(X_test)
voting_rf.predict(X_test)

모델 파라미터

아래부터는 ai의 도움을 받았습니다.
적당히 몇 개만 골라서 사용하면 됩니다.

Logistic 회귀분석

# Grid Search version.
# LogisticRegression rejects many penalty/solver combinations (for example
# 'lbfgs' has no L1 support and elastic-net requires 'saga'). The grid is
# therefore a list of sub-grids that only contain valid combinations;
# GridSearchCV accepts a list of dicts as param_grid.
_LOGREG_C = [0.001, 0.01, 0.1, 1, 10, 100]  # regularization strength (smaller = stronger)
_LOGREG_MAX_ITER = [100, 500, 1000, 2000]  # maximum number of iterations

params = [
    {   # L2 penalty: supported by every solver
        "classifier__penalty": ['l2'],
        "classifier__C": _LOGREG_C,
        "classifier__solver": ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
        "classifier__max_iter": _LOGREG_MAX_ITER,
    },
    {   # L1 penalty: only 'liblinear' and 'saga'
        "classifier__penalty": ['l1'],
        "classifier__C": _LOGREG_C,
        "classifier__solver": ['liblinear', 'saga'],
        "classifier__max_iter": _LOGREG_MAX_ITER,
    },
    {   # elastic-net: only 'saga'; l1_ratio is used only here
        "classifier__penalty": ['elasticnet'],
        "classifier__C": _LOGREG_C,
        "classifier__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
        "classifier__solver": ['saga'],
        "classifier__max_iter": _LOGREG_MAX_ITER,
    },
    {   # no regularization: C has no effect, 'liblinear' is not supported.
        # The string 'none' was removed in scikit-learn 1.4; None is accepted
        # from 1.2 on (use 'none' on older versions).
        "classifier__penalty": [None],
        "classifier__solver": ['lbfgs', 'newton-cg', 'sag', 'saga'],
        "classifier__max_iter": _LOGREG_MAX_ITER,
    },
]

# Bayesian Optimization version (hyperopt).
# A flat search space samples every entry independently, so the solver must be
# valid for every penalty that can be drawn. 'liblinear' does not support
# 'elasticnet', which made a share of the trials fail; 'saga' supports
# 'l1', 'l2' and 'elasticnet', so the solver is fixed to it.
params_bayes = {
    "classifier__penalty": hp.choice("penalty", ['l1', 'l2', 'elasticnet']),
    "classifier__C": hp.loguniform("C", np.log(0.001), np.log(100)),
    "classifier__l1_ratio": hp.uniform("l1_ratio", 0.1, 0.9),  # only used when penalty is 'elasticnet'
    "classifier__solver": 'saga',
    # hp.quniform yields floats: cast to int before passing to the estimator
    "classifier__max_iter": hp.quniform("max_iter", 100, 2000, 100)
}

KNN

# Grid Search version.
params = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11, 15, 21],  # k (number of neighbours)
    classifier__weights=['uniform', 'distance'],  # weighting scheme
    classifier__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],  # neighbour search algorithm
    classifier__leaf_size=[20, 30, 40, 50],  # leaf size (for ball_tree, kd_tree)
    classifier__p=[1, 2],  # distance power (1: Manhattan, 2: Euclidean)
    classifier__metric=['minkowski', 'euclidean', 'manhattan'],  # distance function
)

# Bayesian Optimization version (hyperopt).
# hp.quniform draws are floats: cast n_neighbors and leaf_size to int before
# passing them to the estimator.
params_bayes = dict(
    classifier__n_neighbors=hp.quniform("n_neighbors", 3, 21, 2),
    classifier__weights=hp.choice("weights", ['uniform', 'distance']),
    classifier__algorithm=hp.choice("algorithm", ['auto', 'ball_tree', 'kd_tree']),
    classifier__leaf_size=hp.quniform("leaf_size", 20, 50, 10),
    classifier__p=hp.choice("p", [1, 2]),
    classifier__metric=hp.choice("metric", ['minkowski', 'euclidean', 'manhattan']),
)

Support Vector Machine

# Grid Search version.
params = dict(
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],  # regularization parameter (smaller = stronger)
    classifier__kernel=['linear', 'poly', 'rbf', 'sigmoid'],  # kernel function
    classifier__degree=[2, 3, 4, 5],  # polynomial degree (poly kernel only)
    classifier__gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],  # kernel coefficient (rbf, poly, sigmoid)
    classifier__coef0=[0.0, 0.1, 0.5, 1.0],  # independent kernel term (poly, sigmoid)
    classifier__shrinking=[True, False],  # whether to use the shrinking heuristic
    classifier__tol=[1e-5, 1e-4, 1e-3],  # stopping tolerance
    classifier__max_iter=[100, 500, 1000, 2000, -1],  # iteration cap (-1: no limit)
)

# Bayesian Optimization version (hyperopt).
# hp.quniform draws are floats: cast degree and max_iter to int before passing
# them to the estimator.
params_bayes = dict(
    classifier__C=hp.loguniform("C", np.log(0.001), np.log(1000)),
    classifier__kernel=hp.choice("kernel", ['linear', 'poly', 'rbf', 'sigmoid']),
    classifier__degree=hp.quniform("degree", 2, 5, 1),  # poly kernel only
    # gamma is either one of the two presets or a log-uniform float
    classifier__gamma=hp.choice(
        "gamma",
        ['scale', 'auto', hp.loguniform("gamma_float", np.log(0.001), np.log(10))],
    ),
    classifier__coef0=hp.uniform("coef0", 0.0, 1.0),
    classifier__shrinking=hp.choice("shrinking", [True, False]),
    classifier__tol=hp.loguniform("tol", np.log(1e-5), np.log(1e-3)),
    classifier__max_iter=hp.quniform("max_iter", 100, 2000, 100),
)

Random Forest

# Grid Search version.
# oob_score=True and max_samples != None are only valid together with
# bootstrap=True (RandomForest raises ValueError otherwise), so the grid is
# split into a bootstrap sub-grid and a no-bootstrap sub-grid.
# GridSearchCV accepts a list of dicts as param_grid.
_rf_common = {
    "classifier__n_estimators": [50, 100, 200, 300, 500],  # number of trees
    "classifier__criterion": ['gini', 'entropy', 'log_loss'],  # impurity criterion
    "classifier__max_depth": [None, 5, 10, 20, 30, 50],  # maximum tree depth (None: unlimited)
    "classifier__min_samples_split": [2, 5, 10, 20],  # min samples needed to split an internal node
    "classifier__min_samples_leaf": [1, 2, 4, 10],  # min samples required at a leaf
    "classifier__min_weight_fraction_leaf": [0.0, 0.1, 0.2],  # min weight fraction required at a leaf
    "classifier__max_features": ['sqrt', 'log2', None, 0.3, 0.5, 0.7],  # features considered per split
    "classifier__max_leaf_nodes": [None, 50, 100, 200],  # maximum number of leaves
    "classifier__min_impurity_decrease": [0.0, 0.01, 0.05, 0.1],  # min impurity decrease needed to split
    "classifier__ccp_alpha": [0.0, 0.01, 0.05, 0.1],  # cost-complexity pruning parameter
}

params = [
    {   # with bootstrap sampling
        **_rf_common,
        "classifier__bootstrap": [True],
        "classifier__oob_score": [True, False],  # whether to compute the OOB score
        "classifier__max_samples": [None, 0.5, 0.7, 0.9],  # fraction of samples drawn per tree
    },
    {   # without bootstrap: every tree sees the whole dataset
        **_rf_common,
        "classifier__bootstrap": [False],
        "classifier__oob_score": [False],
        "classifier__max_samples": [None],
    },
]

# Bayesian Optimization version (hyperopt).
# hp.quniform draws are floats: cast the integer-valued entries to int before
# passing them to the estimator.
params_bayes = {
    "classifier__n_estimators": hp.quniform("n_estimators", 50, 500, 50),
    "classifier__criterion": hp.choice("criterion", ['gini', 'entropy']),
    "classifier__max_depth": hp.choice("max_depth", [None, hp.quniform("max_depth_val", 5, 50, 5)]),
    "classifier__min_samples_split": hp.quniform("min_samples_split", 2, 20, 2),
    "classifier__min_samples_leaf": hp.quniform("min_samples_leaf", 1, 10, 1),
    "classifier__max_features": hp.choice("max_features", ['sqrt', 'log2', None]),
    "classifier__min_impurity_decrease": hp.uniform("min_impurity_decrease", 0.0, 0.1),
    # A flat space samples every entry independently, and max_samples is only
    # valid with bootstrap=True (ValueError otherwise). bootstrap is therefore
    # fixed to True; to explore bootstrap=False, use a separate space without
    # max_samples.
    "classifier__bootstrap": True,
    "classifier__max_samples": hp.choice("max_samples", [None, hp.uniform("max_samples_val", 0.5, 1.0)]),
    "classifier__ccp_alpha": hp.uniform("ccp_alpha", 0.0, 0.1)
}

XGBOOST

# Grid Search version.
params = dict(
    classifier__n_estimators=[50, 100, 200, 300, 500],  # number of boosting rounds
    classifier__learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],  # learning rate (eta)
    classifier__max_depth=[3, 4, 5, 6, 7, 8],  # maximum tree depth
    classifier__min_child_weight=[1, 3, 5, 7],  # minimum sum of weights in a leaf
    classifier__gamma=[0, 0.1, 0.2, 0.3, 0.4],  # minimum loss reduction needed to split
    classifier__subsample=[0.6, 0.7, 0.8, 0.9, 1.0],  # row sampling ratio
    classifier__colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],  # column sampling ratio (per tree)
    classifier__colsample_bylevel=[0.6, 0.7, 0.8, 0.9, 1.0],  # column sampling ratio (per level)
    classifier__colsample_bynode=[0.6, 0.7, 0.8, 0.9, 1.0],  # column sampling ratio (per node)
    classifier__reg_alpha=[0, 0.01, 0.1, 1, 10],  # L1 regularization
    classifier__reg_lambda=[0, 0.01, 0.1, 1, 10],  # L2 regularization
    classifier__max_delta_step=[0, 1, 2, 5, 10],  # cap on each tree's weight change
    classifier__scale_pos_weight=[1, 2, 3, 5],  # positive-class weight (for imbalanced data)
)

# Bayesian Optimization version (hyperopt).
# hp.quniform draws are floats: cast the integer-valued entries
# (n_estimators, max_depth, ...) to int before passing them to the estimator.
params_bayes = dict(
    classifier__n_estimators=hp.quniform("n_estimators", 50, 500, 50),
    classifier__learning_rate=hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    classifier__max_depth=hp.quniform("max_depth", 3, 8, 1),
    classifier__min_child_weight=hp.quniform("min_child_weight", 1, 7, 1),
    classifier__gamma=hp.uniform("gamma", 0, 0.4),
    classifier__subsample=hp.uniform("subsample", 0.6, 1.0),
    classifier__colsample_bytree=hp.uniform("colsample_bytree", 0.6, 1.0),
    classifier__colsample_bylevel=hp.uniform("colsample_bylevel", 0.6, 1.0),
    classifier__colsample_bynode=hp.uniform("colsample_bynode", 0.6, 1.0),
    # log-uniform ranges start at 0.01, so exactly 0 is never sampled here
    classifier__reg_alpha=hp.loguniform("reg_alpha", np.log(0.01), np.log(10)),
    classifier__reg_lambda=hp.loguniform("reg_lambda", np.log(0.01), np.log(10)),
    classifier__max_delta_step=hp.quniform("max_delta_step", 0, 10, 1),
    classifier__scale_pos_weight=hp.quniform("scale_pos_weight", 1, 5, 1),
)

LightGBM

# Grid Search version.
# Each LightGBM parameter is listed once under its scikit-learn API name. The
# native names min_data_in_leaf / feature_fraction / bagging_fraction /
# bagging_freq are aliases of min_child_samples / colsample_bytree /
# subsample / subsample_freq; searching both spellings multiplies the grid
# size and makes it ambiguous which value LightGBM actually uses.
params = {
    "classifier__n_estimators": [50, 100, 200, 300, 500],  # number of boosting rounds
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # learning rate
    "classifier__max_depth": [3, 5, 7, 10, -1],  # maximum tree depth (-1: unlimited)
    "classifier__num_leaves": [15, 31, 63, 127, 255],  # number of leaves (keep below 2^max_depth)
    "classifier__min_child_samples": [10, 20, 30, 50, 100],  # min samples in a leaf (alias: min_data_in_leaf)
    "classifier__min_child_weight": [1e-3, 1e-2, 1e-1, 1],  # minimum sum of weights in a leaf
    "classifier__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],  # row sampling ratio (alias: bagging_fraction)
    "classifier__subsample_freq": [0, 1, 3, 5],  # bagging frequency (alias: bagging_freq)
    "classifier__colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],  # column sampling ratio (alias: feature_fraction)
    "classifier__reg_alpha": [0, 0.01, 0.1, 1, 10],  # L1 regularization
    "classifier__reg_lambda": [0, 0.01, 0.1, 1, 10],  # L2 regularization
    "classifier__min_split_gain": [0, 0.01, 0.1, 0.5],  # minimum gain needed to split
    # NOTE(review): 'goss' may reject bagging (subsample < 1 with
    # subsample_freq > 0) depending on the LightGBM version -- confirm.
    "classifier__boosting_type": ['gbdt', 'dart', 'goss'],  # boosting type
}

# Bayesian Optimization version (hyperopt).
# Each LightGBM parameter is searched once under its scikit-learn API name. The
# native names min_data_in_leaf / feature_fraction / bagging_fraction /
# bagging_freq are aliases of min_child_samples / colsample_bytree /
# subsample / subsample_freq; sampling both spellings independently makes it
# ambiguous which value LightGBM actually uses.
# hp.quniform draws are floats: cast the integer-valued entries to int before
# passing them to the estimator.
params_bayes = {
    "classifier__n_estimators": hp.quniform("n_estimators", 50, 500, 50),
    "classifier__learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    "classifier__max_depth": hp.choice("max_depth", [-1, hp.quniform("max_depth_val", 3, 10, 1)]),
    "classifier__num_leaves": hp.quniform("num_leaves", 15, 255, 16),
    "classifier__min_child_samples": hp.quniform("min_child_samples", 10, 100, 10),  # alias: min_data_in_leaf
    "classifier__min_child_weight": hp.loguniform("min_child_weight", np.log(1e-3), np.log(1)),
    "classifier__subsample": hp.uniform("subsample", 0.6, 1.0),  # alias: bagging_fraction
    "classifier__subsample_freq": hp.quniform("subsample_freq", 0, 5, 1),  # alias: bagging_freq
    "classifier__colsample_bytree": hp.uniform("colsample_bytree", 0.6, 1.0),  # alias: feature_fraction
    "classifier__reg_alpha": hp.loguniform("reg_alpha", np.log(0.01), np.log(10)),
    "classifier__reg_lambda": hp.loguniform("reg_lambda", np.log(0.01), np.log(10)),
    "classifier__min_split_gain": hp.uniform("min_split_gain", 0, 0.5),
    # NOTE(review): 'goss' may reject bagging (subsample < 1 with
    # subsample_freq > 0) depending on the LightGBM version -- confirm.
    "classifier__boosting_type": hp.choice("boosting_type", ['gbdt', 'dart', 'goss']),
}

Catboost

# Grid Search version.
# Several CatBoost options are only valid for specific settings:
#   - bagging_temperature: Bayesian bootstrap only
#   - subsample:           Bernoulli / MVS (and the GPU-only Poisson) bootstrap
#   - min_data_in_leaf:    Depthwise / Lossguide grow policies only
#   - max_leaves:          Lossguide grow policy only
# A single flat grid crosses them freely and produces invalid combinations, so
# the grid is built as a list of sub-grids, one per (bootstrap, grow policy)
# pair. GridSearchCV accepts a list of dicts as param_grid.
# 'Poisson' bootstrap is left out because it is only available on GPU; add it
# to the Bernoulli/MVS entry when training with task_type='GPU'.
_cb_common = {
    "n_estimators": [50, 100, 200, 300, 500],  # number of boosting rounds
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # learning rate
    "depth": [3, 4, 5, 6, 7, 8],  # tree depth (CatBoost uses depth rather than max_depth)
    "l2_leaf_reg": [1, 3, 5, 10, 20],  # L2 regularization
    "border_count": [32, 64, 128, 255],  # number of splits for numerical features
    "random_strength": [1, 2, 5, 10],  # randomness of the tree structure
    "od_type": ['IncToDec', 'Iter'],  # overfitting-detector (early stopping) type
    "od_wait": [10, 20, 50],  # rounds to wait before early stopping
    "rsm": [0.5, 0.7, 0.9, 1.0],  # random subspace method (feature sampling)
    "leaf_estimation_iterations": [1, 3, 5, 10],  # iterations for leaf value estimation
}
_cb_bootstrap_grids = [
    {"bootstrap_type": ['Bayesian'], "bagging_temperature": [0, 0.5, 1, 2, 5]},  # 0 disables it
    {"bootstrap_type": ['Bernoulli', 'MVS'], "subsample": [0.6, 0.7, 0.8, 0.9, 1.0]},
]
_cb_grow_grids = [
    {"grow_policy": ['SymmetricTree']},
    {"grow_policy": ['Depthwise'], "min_data_in_leaf": [1, 5, 10, 20]},
    {"grow_policy": ['Lossguide'], "min_data_in_leaf": [1, 5, 10, 20], "max_leaves": [16, 31, 64, 127]},
]

params = [
    {
        f"classifier__{name}": values
        for name, values in {**_cb_common, **bootstrap_grid, **grow_grid}.items()
    }
    for bootstrap_grid in _cb_bootstrap_grids
    for grow_grid in _cb_grow_grids
]

# Bayesian Optimization version (hyperopt).
# hp.quniform draws are floats: cast the integer-valued entries to int before
# passing them to the estimator.
# NOTE(review): every entry is sampled independently, so this space can pair
# options that CatBoost presumably rejects together (e.g. subsample or
# bagging_temperature with the wrong bootstrap_type, max_leaves without
# grow_policy='Lossguide') -- confirm and prune per experiment.
params_bayes = dict(
    classifier__n_estimators=hp.quniform("n_estimators", 50, 500, 50),
    classifier__learning_rate=hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    classifier__depth=hp.quniform("depth", 3, 8, 1),
    classifier__l2_leaf_reg=hp.quniform("l2_leaf_reg", 1, 20, 1),
    classifier__border_count=hp.choice("border_count", [32, 64, 128, 255]),
    classifier__bagging_temperature=hp.uniform("bagging_temperature", 0, 5),
    classifier__random_strength=hp.quniform("random_strength", 1, 10, 1),
    classifier__od_type=hp.choice("od_type", ['IncToDec', 'Iter']),
    classifier__od_wait=hp.quniform("od_wait", 10, 50, 10),
    classifier__bootstrap_type=hp.choice("bootstrap_type", ['Bayesian', 'Bernoulli', 'MVS']),
    classifier__subsample=hp.uniform("subsample", 0.6, 1.0),
    classifier__rsm=hp.uniform("rsm", 0.5, 1.0),
    classifier__leaf_estimation_iterations=hp.quniform("leaf_estimation_iterations", 1, 10, 1),
    classifier__grow_policy=hp.choice("grow_policy", ['SymmetricTree', 'Depthwise', 'Lossguide']),
    classifier__min_data_in_leaf=hp.quniform("min_data_in_leaf", 1, 20, 1),
    classifier__max_leaves=hp.quniform("max_leaves", 16, 127, 1),
)

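# Key features:
# - GPU support (enable with task_type='GPU'; choose the device with devices='0')
# - Automatic handling of categorical variables (specified via the cat_features parameter)
# - Built-in cross-validation and early stopping
# - Support for text and embedding features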

맨 위로