Overview

This post shares tutorial code for optimizing XGBoost hyperparameters with BayesianOptimization.

See the link for the dataset used.

Package imports

import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

from sklearn.metrics import r2_score

import numpy as np

from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

from bayes_opt import BayesianOptimization

Function definitions

def average_dups(x):
    # Replace the target of each duplicate group with the group mean.
    # Note: this mutates the global Y in place.
    Y.loc[list(x.index)] = Y.loc[list(x.index)].mean()

def xgb_r2_score(preds, dtrain):
    # Custom eval metric: xgboost calls feval(preds, dtrain) and expects
    # a (name, value) pair. Courtesy of Tilii.
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

def train_xgb(max_depth, subsample, min_child_weight, gamma, colsample_bytree):
    # Evaluate an XGBoost model with the given params and return the
    # 5-fold CV R2 score (the quantity BayesianOptimization maximizes)
    xgb_params = {
        'n_trees': 250,
        'eta': 0.01,
        'max_depth': int(max_depth),                    # BO proposes floats
        'subsample': max(min(subsample, 1), 0),         # clip to [0, 1]
        'objective': 'reg:linear',
        'base_score': np.mean(Y),  # base prediction = mean(target)
        'silent': 1,
        'min_child_weight': int(min_child_weight),
        'gamma': max(gamma, 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0)
    }
    scores = xgb.cv(xgb_params, dtrain,
                    num_boost_round=1500,
                    early_stopping_rounds=50,
                    verbose_eval=False,
                    feval=xgb_r2_score,
                    maximize=True,
                    nfold=5)['test-r2-mean'].iloc[-1]
    return scores

Data loading

train = pd.read_csv("../input/train_benz.csv")
test = pd.read_csv("../input/test_benz.csv")

Label encoding

for c in train.columns:
    if train[c].dtype == 'object':
        # Fit on train + test together so test-only categories are encoded too
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

Defining X and Y

# Organize our data for training
X = train.drop(["y"], axis=1)
Y = train["y"]
X_Test = test

Feature Engineering

# Handling duplicate values
# First we group the duplicates and then average them
dups = X[X.duplicated(keep=False)]
dups.groupby(dups.columns.tolist()).apply(average_dups)

# Drop duplicates keeping only 1 instance of each group
train.drop(X[X.duplicated()].index.values, axis=0, inplace=True)
X = train.drop(["y"], axis=1)
Y = train["y"]

# Fix index after dropping
X.reset_index(inplace=True, drop=True)
Y.reset_index(inplace=True, drop=True)
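
To see what the duplicate averaging above does, here is a standalone toy illustration (hypothetical data, not from the competition):

import pandas as pd

# Two identical feature rows (index 0 and 1) share one averaged target.
toy_X = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
toy_Y = pd.Series([10.0, 20.0, 30.0])

dup_rows = toy_X[toy_X.duplicated(keep=False)]  # rows 0 and 1
for _, grp in dup_rows.groupby(dup_rows.columns.tolist()):
    toy_Y.loc[grp.index] = toy_Y.loc[grp.index].mean()

print(toy_Y.tolist())  # [15.0, 15.0, 30.0]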

# Handling outliers
# Y[Y > 150] = Y.quantile(0.99)


pca = PCA(n_components=5)
ica = FastICA(n_components=5, max_iter=1000)
tsvd = TruncatedSVD(n_components=5)
gp = GaussianRandomProjection(n_components=5)
sp = SparseRandomProjection(n_components=5, dense_output=True)

x_pca = pd.DataFrame(pca.fit_transform(X))
x_ica = pd.DataFrame(ica.fit_transform(X))
x_tsvd = pd.DataFrame(tsvd.fit_transform(X))
x_gp = pd.DataFrame(gp.fit_transform(X))
x_sp = pd.DataFrame(sp.fit_transform(X))

x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
x_ica.columns = ["ica_{}".format(i) for i in x_ica.columns]
x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]

X = pd.concat((X, x_pca), axis=1)
X = pd.concat((X, x_ica), axis=1)
X = pd.concat((X, x_tsvd), axis=1)
X = pd.concat((X, x_gp), axis=1)
X = pd.concat((X, x_sp), axis=1)

x_test_pca = pd.DataFrame(pca.transform(X_Test))
x_test_ica = pd.DataFrame(ica.transform(X_Test))
x_test_tsvd = pd.DataFrame(tsvd.transform(X_Test))
x_test_gp = pd.DataFrame(gp.transform(X_Test))
x_test_sp = pd.DataFrame(sp.transform(X_Test))

x_test_pca.columns = ["pca_{}".format(i) for i in x_test_pca.columns]
x_test_ica.columns = ["ica_{}".format(i) for i in x_test_ica.columns]
x_test_tsvd.columns = ["tsvd_{}".format(i) for i in x_test_tsvd.columns]
x_test_gp.columns = ["gp_{}".format(i) for i in x_test_gp.columns]
x_test_sp.columns = ["sp_{}".format(i) for i in x_test_sp.columns]


X_Test = pd.concat((X_Test, x_test_pca), axis=1)
X_Test = pd.concat((X_Test, x_test_ica), axis=1)
X_Test = pd.concat((X_Test, x_test_tsvd), axis=1)
X_Test = pd.concat((X_Test, x_test_gp), axis=1)
X_Test = pd.concat((X_Test, x_test_sp), axis=1)

dtrain = xgb.DMatrix(X, Y)
dtest = xgb.DMatrix(X_Test)
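
As an optional sanity check before launching the optimizer, the objective can be called once by hand; the values below are arbitrary, not tuned:

# One-off evaluation of the objective with arbitrary hyperparameter values.
print(train_xgb(max_depth=5, subsample=0.8, min_child_weight=3,
                gamma=1.0, colsample_bytree=0.7))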

Hyperparameter search space

BayesianOptimization samples every parameter as a float from its (low, high) bounds, which is why train_xgb casts max_depth and min_child_weight to int.

# Parameter bounds for the Bayesian optimizer (continuous ranges, not a grid)
params = {
  'min_child_weight':(1, 20),
  'gamma':(0, 10),
  'subsample':(0.5, 1),
  'colsample_bytree':(0.1, 1),
  'max_depth': (2, 10)
}

Running BayesianOptimization

The optimizer first evaluates init_points random points to seed the surrogate model, then runs n_iter guided iterations.

# Initialize BO optimizer
xgb_bayesopt = BayesianOptimization(train_xgb, params)

# Maximize R2 score
xgb_bayesopt.maximize(init_points=5, n_iter=25)

The results look like the following (the repeated sklearn fmin_l_bfgs_b UserWarnings that the Gaussian process emitted during the run are trimmed from the log).

Initialization
---------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample |
    1 | 00m00s |    0.00353 |             0.1067 |    5.9702 |      4.9732 |             7.9105 |      0.5793 |
    2 | 00m02s |    0.00943 |             0.5810 |    8.1688 |      8.5239 |             6.7358 |      0.6484 |
    3 | 00m02s |    0.00939 |             0.5858 |    0.3428 |      7.3622 |            12.2737 |      0.8497 |
    4 | 00m01s |    0.00942 |             0.6780 |    6.4580 |      4.8621 |             4.3302 |      0.9020 |
    5 | 00m01s |    0.00937 |             0.6267 |    5.9666 |      3.6477 |             3.9127 |      0.8826 |


Bayesian Optimization
---------------------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample |
    6 | 00m10s |    0.00645 |             0.1168 |    0.0585 |      9.9426 |             1.6162 |      0.6219 |
    7 | 00m13s |    0.00934 |             0.9623 |    9.9661 |      9.9366 |            19.9755 |      0.5912 |
    8 | 00m11s |    0.00815 |             0.9904 |    0.2751 |      2.1226 |            19.9564 |      0.6543 |
    9 | 00m20s |    0.00940 |             0.9913 |    0.2288 |      9.9603 |            19.8633 |      0.7310 |
   10 | 00m17s |    0.00805 |             0.9976 |    0.2792 |      2.3768 |             1.8580 |      0.5414 |
   11 | 00m17s |    0.00789 |             0.2153 |    9.8183 |      9.5261 |             1.0594 |      0.9613 |
   12 | 00m16s |    0.00794 |             0.9883 |    9.8348 |      2.1611 |            12.6251 |      0.9193 |
   13 | 00m18s |    0.00806 |             0.9637 |    9.7190 |      2.1983 |             1.0225 |      0.5885 |
   14 | 00m20s |    0.00949 |             0.9592 |    3.6529 |      9.9578 |             6.6313 |      0.9456 |
   15 | 00m18s |    0.00885 |             0.3171 |    9.8643 |      9.8502 |            12.5730 |      0.9990 |
   16 | 00m17s |    0.00799 |             0.9798 |    0.0707 |      2.3295 |            10.2960 |      0.8780 |
   17 | 00m16s |    0.00799 |             0.8820 |    9.7323 |      2.2531 |            19.8410 |      0.7622 |
   18 | 00m19s |    0.00941 |             0.8950 |    9.7538 |      9.8350 |             6.2464 |      0.9611 |
   19 | 00m20s |    0.00951 |             0.9958 |    5.4073 |      6.3263 |            17.7431 |      0.9789 |
   20 | 00m21s |    0.00927 |             0.9379 |    6.1260 |      9.5142 |             1.0487 |      0.5359 |
   21 | 00m21s |    0.00933 |             0.9466 |    2.0336 |      9.9929 |            14.4140 |      0.5022 |
   22 | 00m18s |    0.00776 |             0.5678 |    3.9906 |      2.0586 |             1.0376 |      0.9934 |
   23 | 00m21s |    0.00938 |             0.9763 |    9.8840 |      6.0286 |            16.0229 |      0.5409 |
   24 | 00m20s |    0.00949 |             0.9650 |    0.0155 |      7.4628 |            17.0025 |      0.9130 |
   25 | 00m20s |    0.00945 |             0.9525 |    0.0655 |      9.8559 |             9.4017 |      0.9323 |
   26 | 00m19s |    0.00724 |             0.1331 |    4.8099 |      9.9214 |            19.8350 |      0.8942 |
   27 | 00m22s |    0.00947 |             0.9853 |    0.0610 |      6.6800 |             5.7583 |      0.5535 |
   28 | 00m20s |    0.00949 |             0.9225 |    9.9982 |      6.7372 |            19.7594 |      0.9578 |
   29 | 00m26s |    0.00944 |             0.9944 |    8.7620 |      9.9011 |            15.7535 |      0.9037 |
   30 | 00m19s |    0.00804 |             0.9735 |    9.9194 |      2.1628 |             4.1877 |      0.9736 |

Retrieving the best hyperparameters

# Get the best params
p = xgb_bayesopt.res['max']['max_params']
print(p)

Printing it gives the following:

{'subsample': 0.9789123365350143, 'max_depth': 6.326297237821378, 'gamma': 5.407321151452637, 'min_child_weight': 17.74306822108236, 'colsample_bytree': 0.9957738074455033}
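
As an aside, this post uses the 0.x bayes_opt API; in bayes_opt 1.0 and later the best result is read differently. A minimal sketch, assuming a newer bayes_opt release:

# bayes_opt >= 1.0 exposes the best result via the .max property:
# {'target': <best score>, 'params': {<name>: <value>, ...}}
p = xgb_bayesopt.max['params']
print(xgb_bayesopt.max['target'], p)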

Training

Train a final model with the best hyperparameters found above.

xgb_params = {
    'n_trees': 250,
    'eta': 0.01,
    'max_depth': int(p['max_depth']),
    'subsample': max(min(p['subsample'], 1), 0),
    'objective': 'reg:linear',
    'base_score': np.mean(Y), # base prediction = mean(target)
    'silent': 1,
    'min_child_weight': int(p['min_child_weight']),
    'gamma': max(p['gamma'], 0),
    'colsample_bytree': max(min(p['colsample_bytree'], 1), 0)
}

model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                  verbose_eval=False, feval=xgb_r2_score, maximize=True)

Y_Test = model.predict(dtest)

results_df = pd.DataFrame(data={'y':Y_Test})
ids = test["ID"]
joined = pd.DataFrame(ids).join(results_df)
joined.head()
   ID           y
0   1   80.332069
1   2   97.720398
2   3   82.628181
3   4   82.749779
4   5  117.250488
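
To persist the predictions as a Kaggle-style submission file (the filename here is illustrative):

# Write the ID/prediction pairs to CSV (filename is arbitrary).
joined.to_csv("xgb_bayesopt_submission.csv", index=False)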

Snippet

A short, self-contained snippet is below. Note that this objective reports CV RMSE, so it returns the negated score for BayesianOptimization to maximize.

import xgboost as xgb
from bayes_opt import BayesianOptimization

dtrain = xgb.DMatrix(train_X, train_Y)

def train_xgb(max_depth, subsample, min_child_weight, gamma, colsample_bytree):
    # Evaluate an XGBoost model using the given params
    xgb_params = {
        'n_trees': 400,
        'max_depth': int(max_depth),
        'subsample': max(min(subsample, 1), 0),
        'silent': 1,
        'min_child_weight': int(min_child_weight),
        'gamma': max(gamma, 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'tree_method': 'gpu_hist'
    }
    score = xgb.cv(xgb_params, dtrain, num_boost_round=1500,
                   verbose_eval=False, nfold=5)['test-rmse-mean'].iloc[-1]
    # BayesianOptimization maximizes, so return the negated RMSE
    return -score

params = {
  'min_child_weight':(1, 20),
  'gamma':(0, 10),
  'subsample':(0.5, 1),
  'colsample_bytree':(0.1, 1),
  'max_depth': (2, 10)
}

# Initialize BO optimizer
xgb_bayesopt = BayesianOptimization(train_xgb, params)

# Maximize the negated RMSE (i.e. minimize CV RMSE)
xgb_bayesopt.maximize(init_points=5, n_iter=25)
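
As in the full example, the best point can then be read back from the optimizer (same 0.x bayes_opt API as above); note the sign flip when recovering the RMSE:

# Best point found; undo the sign flip to recover the CV RMSE.
p = xgb_bayesopt.res['max']['max_params']
best_rmse = -xgb_bayesopt.res['max']['max_val']
print(p, best_rmse)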