Overview
This post shares tutorial code for optimizing XGBoost hyperparameters with BayesianOptimization.
See the link for the dataset used.
Package imports
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import r2_score
import numpy as np
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from bayes_opt import BayesianOptimization
Function definitions
def average_dups(x):
    # Replace the targets of a group of duplicate rows with their mean
    # (note: mutates the global Y in place)
    Y.loc[list(x.index)] = Y.loc[list(x.index)].mean()

def xgb_r2_score(preds, dtrain):
    # Custom eval metric: R2 score (courtesy of Tilii)
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

def train_xgb(max_depth, subsample, min_child_weight, gamma, colsample_bytree):
    # Objective for BayesianOptimization: 5-fold CV R2 of an XGBoost model
    # with the given params. Integer params are cast from the continuous values
    # the optimizer proposes; bounded params are clipped to their valid ranges.
    xgb_params = {
        'n_trees': 250,
        'eta': 0.01,
        'max_depth': int(max_depth),
        'subsample': max(min(subsample, 1), 0),
        'objective': 'reg:linear',
        'base_score': np.mean(Y),  # base prediction = mean(target)
        'silent': 1,
        'min_child_weight': int(min_child_weight),
        'gamma': max(gamma, 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0)
    }
    scores = xgb.cv(xgb_params, dtrain, num_boost_round=1500,
                    early_stopping_rounds=50, verbose_eval=False,
                    feval=xgb_r2_score, maximize=True,
                    nfold=5)['test-r2-mean'].iloc[-1]
    return scores
Data loading
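The loading step is not shown in the original; a minimal sketch, assuming Kaggle-style train.csv and test.csv files from the linked dataset:

# Assumed file names; adjust the paths to match the linked dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")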
Label encoding
for c in train.columns:
    if train[c].dtype == 'object':
        # Fit on the union of train and test categories so both can be transformed
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))
Defining X and Y
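The definitions are omitted in the original, but the later code (the duplicate handling below, `average_dups`, and `xgb.DMatrix(X_Test)`) implies something along these lines; `X_Test` in particular is an assumption inferred from its later use:

# Inferred definitions (assumption): X/Y come from train, X_Test from test
X = train.drop(["y"], axis=1)
Y = train["y"]
X_Test = test.copy()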
Feature Engineering
# Handling duplicate values
# First we group the duplicates and then average them
dups = X[X.duplicated(keep=False)]
dups.groupby(dups.columns.tolist()).apply(average_dups)
# Drop duplicates keeping only 1 instance of each group
train.drop(X[X.duplicated()].index.values, axis=0, inplace=True)
X = train.drop(["y"], axis=1)
Y = train["y"]
# Fix index after dropping
X.reset_index(inplace=True, drop=True)
Y.reset_index(inplace=True, drop=True)
# Handling outliers
# Y[Y > 150] = Y.quantile(0.99)
pca = PCA(n_components=5)
ica = FastICA(n_components=5, max_iter=1000)
tsvd = TruncatedSVD(n_components=5)
gp = GaussianRandomProjection(n_components=5)
sp = SparseRandomProjection(n_components=5, dense_output=True)
x_pca = pd.DataFrame(pca.fit_transform(X))
x_ica = pd.DataFrame(ica.fit_transform(X))
x_tsvd = pd.DataFrame(tsvd.fit_transform(X))
x_gp = pd.DataFrame(gp.fit_transform(X))
x_sp = pd.DataFrame(sp.fit_transform(X))
x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
x_ica.columns = ["ica_{}".format(i) for i in x_ica.columns]
x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]
X = pd.concat((X, x_pca), axis=1)
X = pd.concat((X, x_ica), axis=1)
X = pd.concat((X, x_tsvd), axis=1)
X = pd.concat((X, x_gp), axis=1)
X = pd.concat((X, x_sp), axis=1)
x_test_pca = pd.DataFrame(pca.transform(X_Test))
x_test_ica = pd.DataFrame(ica.transform(X_Test))
x_test_tsvd = pd.DataFrame(tsvd.transform(X_Test))
x_test_gp = pd.DataFrame(gp.transform(X_Test))
x_test_sp = pd.DataFrame(sp.transform(X_Test))
x_test_pca.columns = ["pca_{}".format(i) for i in x_test_pca.columns]
x_test_ica.columns = ["ica_{}".format(i) for i in x_test_ica.columns]
x_test_tsvd.columns = ["tsvd_{}".format(i) for i in x_test_tsvd.columns]
x_test_gp.columns = ["gp_{}".format(i) for i in x_test_gp.columns]
x_test_sp.columns = ["sp_{}".format(i) for i in x_test_sp.columns]
X_Test = pd.concat((X_Test, x_test_pca), axis=1)
X_Test = pd.concat((X_Test, x_test_ica), axis=1)
X_Test = pd.concat((X_Test, x_test_tsvd), axis=1)
X_Test = pd.concat((X_Test, x_test_gp), axis=1)
X_Test = pd.concat((X_Test, x_test_sp), axis=1)
dtrain = xgb.DMatrix(X, Y)
dtest = xgb.DMatrix(X_Test)
Setting the hyperparameter search space
# Parameter bounds for BayesianOptimization (not a grid):
# each entry is a (min, max) range the optimizer samples from
params = {
    'min_child_weight': (1, 20),
    'gamma': (0, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.1, 1),
    'max_depth': (2, 10)
}
Running BayesianOptimization
# Initialize BO optimizer
xgb_bayesopt = BayesianOptimization(train_xgb, params)
# Maximize R2 score
xgb_bayesopt.maximize(init_points=5, n_iter=25)
The output is shown below. During the run, sklearn's GaussianProcessRegressor repeatedly emitted "fmin_l_bfgs_b terminated abnormally" UserWarnings while optimizing the acquisition function; they do not affect the result and are trimmed from the log.

Initialization
-----------------------------------------------------------------------------------------------
 Step |   Time |    Value | colsample_bytree |   gamma | max_depth | min_child_weight | subsample
    1 | 00m00s |  0.00353 |           0.1067 |  5.9702 |    4.9732 |           7.9105 |    0.5793
    2 | 00m02s |  0.00943 |           0.5810 |  8.1688 |    8.5239 |           6.7358 |    0.6484
    3 | 00m02s |  0.00939 |           0.5858 |  0.3428 |    7.3622 |          12.2737 |    0.8497
    4 | 00m01s |  0.00942 |           0.6780 |  6.4580 |    4.8621 |           4.3302 |    0.9020
    5 | 00m01s |  0.00937 |           0.6267 |  5.9666 |    3.6477 |           3.9127 |    0.8826

Bayesian Optimization
-----------------------------------------------------------------------------------------------
 Step |   Time |    Value | colsample_bytree |   gamma | max_depth | min_child_weight | subsample
    6 | 00m10s |  0.00645 |           0.1168 |  0.0585 |    9.9426 |           1.6162 |    0.6219
    7 | 00m13s |  0.00934 |           0.9623 |  9.9661 |    9.9366 |          19.9755 |    0.5912
    8 | 00m11s |  0.00815 |           0.9904 |  0.2751 |    2.1226 |          19.9564 |    0.6543
    9 | 00m20s |  0.00940 |           0.9913 |  0.2288 |    9.9603 |          19.8633 |    0.7310
   10 | 00m17s |  0.00805 |           0.9976 |  0.2792 |    2.3768 |           1.8580 |    0.5414
   11 | 00m17s |  0.00789 |           0.2153 |  9.8183 |    9.5261 |           1.0594 |    0.9613
   12 | 00m16s |  0.00794 |           0.9883 |  9.8348 |    2.1611 |          12.6251 |    0.9193
   13 | 00m18s |  0.00806 |           0.9637 |  9.7190 |    2.1983 |           1.0225 |    0.5885
   14 | 00m20s |  0.00949 |           0.9592 |  3.6529 |    9.9578 |           6.6313 |    0.9456
   15 | 00m18s |  0.00885 |           0.3171 |  9.8643 |    9.8502 |          12.5730 |    0.9990
   16 | 00m17s |  0.00799 |           0.9798 |  0.0707 |    2.3295 |          10.2960 |    0.8780
   17 | 00m16s |  0.00799 |           0.8820 |  9.7323 |    2.2531 |          19.8410 |    0.7622
   18 | 00m19s |  0.00941 |           0.8950 |  9.7538 |    9.8350 |           6.2464 |    0.9611
   19 | 00m20s |  0.00951 |           0.9958 |  5.4073 |    6.3263 |          17.7431 |    0.9789
   20 | 00m21s |  0.00927 |           0.9379 |  6.1260 |    9.5142 |           1.0487 |    0.5359
   21 | 00m21s |  0.00933 |           0.9466 |  2.0336 |    9.9929 |          14.4140 |    0.5022
   22 | 00m18s |  0.00776 |           0.5678 |  3.9906 |    2.0586 |           1.0376 |    0.9934
   23 | 00m21s |  0.00938 |           0.9763 |  9.8840 |    6.0286 |          16.0229 |    0.5409
   24 | 00m20s |  0.00949 |           0.9650 |  0.0155 |    7.4628 |          17.0025 |    0.9130
   25 | 00m20s |  0.00945 |           0.9525 |  0.0655 |    9.8559 |           9.4017 |    0.9323
   26 | 00m19s |  0.00724 |           0.1331 |  4.8099 |    9.9214 |          19.8350 |    0.8942
   27 | 00m22s |  0.00947 |           0.9853 |  0.0610 |    6.6800 |           5.7583 |    0.5535
   28 | 00m20s |  0.00949 |           0.9225 |  9.9982 |    6.7372 |          19.7594 |    0.9578
   29 | 00m26s |  0.00944 |           0.9944 |  8.7620 |    9.9011 |          15.7535 |    0.9037
   30 | 00m19s |  0.00804 |           0.9735 |  9.9194 |    2.1628 |           4.1877 |    0.9736
Checking the best hyperparameters
The best CV score (0.00951, step 19 above) was found; retrieving and printing the corresponding parameter set gives the following.
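The retrieval call is not shown in the original; a minimal sketch assuming the pre-1.0 bayes_opt API (consistent with the log format above), which also defines the `p` used in the training step below:

# Assumption: bayes_opt < 1.0; newer releases expose this as xgb_bayesopt.max['params']
p = xgb_bayesopt.res['max']['max_params']
print(p)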
{'subsample': 0.9789123365350143, 'max_depth': 6.326297237821378, 'gamma': 5.407321151452637, 'min_child_weight': 17.74306822108236, 'colsample_bytree': 0.9957738074455033}
Training
Train the model with the best hyperparameters found above.
xgb_params = {
    'n_trees': 250,
    'eta': 0.01,
    'max_depth': int(p['max_depth']),
    'subsample': max(min(p['subsample'], 1), 0),
    'objective': 'reg:linear',
    'base_score': np.mean(Y),  # base prediction = mean(target)
    'silent': 1,
    'min_child_weight': int(p['min_child_weight']),
    'gamma': max(p['gamma'], 0),
    'colsample_bytree': max(min(p['colsample_bytree'], 1), 0)
}
model = xgb.train(xgb_params, dtrain, num_boost_round=1500, verbose_eval=False, feval=xgb_r2_score, maximize=True)
Y_Test = model.predict(dtest)
results_df = pd.DataFrame(data={'y':Y_Test})
ids = test["ID"]
joined = pd.DataFrame(ids).join(results_df)
joined.head()
|   | ID | y          |
|---|----|------------|
| 0 | 1  | 80.332069  |
| 1 | 2  | 97.720398  |
| 2 | 3  | 82.628181  |
| 3 | 4  | 82.749779  |
| 4 | 5  | 117.250488 |
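To persist the predictions, e.g. for a Kaggle submission, one extra line suffices (the file name here is an assumption):

# Hypothetical output path
joined.to_csv("submission.csv", index=False)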
Snippet
A condensed snippet is below. Note that it cross-validates on RMSE rather than the custom R2 metric, so the objective returns the negative RMSE for BayesianOptimization to maximize.
import xgboost as xgb
from bayes_opt import BayesianOptimization

dtrain = xgb.DMatrix(train_X, train_Y)

def train_xgb(max_depth, subsample, min_child_weight, gamma, colsample_bytree):
    # Evaluate an XGBoost model via 5-fold CV with the given params
    xgb_params = {
        'n_trees': 400,
        'max_depth': int(max_depth),
        'subsample': max(min(subsample, 1), 0),
        'min_child_weight': int(min_child_weight),
        'silent': 1,
        'gamma': max(gamma, 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'tree_method': 'gpu_hist'
    }
    scores = xgb.cv(xgb_params, dtrain, num_boost_round=1500,
                    verbose_eval=False, nfold=5)['test-rmse-mean'].iloc[-1]
    # RMSE should be minimized, so return its negative for the maximizer
    return -scores
params = {
    'min_child_weight': (1, 20),
    'gamma': (0, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.1, 1),
    'max_depth': (2, 10)
}
# Initialize BO optimizer
xgb_bayesopt = BayesianOptimization(train_xgb, params)
# Maximize the negative RMSE
xgb_bayesopt.maximize(init_points=5, n_iter=25)
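As in the full example, the best parameters can then be read back via `xgb_bayesopt.res['max']['max_params']` on pre-1.0 bayes_opt (or `xgb_bayesopt.max['params']` on newer releases).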