Overview

This post collects, from hands-on experience, small but genuinely useful tips for doing EDA and machine learning development in Jupyter Notebook.

Increasing the number of columns Pandas displays

import pandas as pd

# show up to 500 columns instead of truncating wide frames
pd.set_option('display.max_columns', 500)
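
Two related options are often raised at the same time so that long outputs stay readable as well (a minimal sketch; the values are arbitrary):

pd.set_option('display.max_rows', 500)   # show up to 500 rows
pd.set_option('display.width', 1000)     # widen the printed output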

Displaying Matplotlib plots inline

import matplotlib.pyplot as plt

# render plots directly inside the notebook
%matplotlib inline
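
A common companion setting is a larger default figure size for every subsequent plot (optional; the size here is just an example):

plt.rcParams['figure.figsize'] = (12, 8)  # default size for all later plots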

Checking whether a Pandas column exists

if 'image_top_1' in df.columns:
    # fill missing values with the column median, but only if the column is present
    df['image_top_1'].fillna(df['image_top_1'].median(), inplace=True)

Converting whether a value exists into an int column with Pandas

# 1 if an image value is present, 0 if missing (notnull, not isnull, for an "exists" flag)
df['image_exists'] = df['image'].notnull().astype(int)
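
A quick value_counts makes it easy to sanity-check the new flag (sketch):

# counts of 1 (image present) vs 0 (image missing)
print(df['image_exists'].value_counts())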

Dropping NA values with Pandas

# drop rows where 'price' is missing
df = df.dropna(subset=['price'])

Filling NA values with Pandas

df['image_top_1'].fillna(df['image_top_1'].median(), inplace=True)

Checking for NA/null values with Pandas

# isna and isnull are aliases; either counts missing values per column
pd.isna(train_X).sum()
pd.isnull(train_X).sum()
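
To see which columns are the worst offenders, the per-column missing ratio can be sorted (a small sketch):

# fraction of missing values per column, highest first
train_X.isnull().mean().sort_values(ascending=False)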

Replacing inf values with Pandas

import numpy as np

# replace +/-inf (and NaN) with 0 in one pass
df['price'] = df['price'].replace([np.inf, -np.inf, np.nan], 0)
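
Before running the replacement, it can be worth counting how many inf entries are actually present (sketch):

# number of +/-inf entries in the column
np.isinf(df['price']).sum()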

Hashing categorical features

import random

def n_hash(s):
    # map a value to a pseudo-random float in [0, 1) seeded by its hash
    random.seed(hash(s))
    return random.random()

def hash_column(row, col):
    # fall back to a fixed token when the column is absent from the row
    if col in row:
        return n_hash(row[col])
    return n_hash('none')

train['user_hash'] = train.apply(lambda row: hash_column(row, 'user_id'), axis=1)
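
One caveat: Python 3 salts the built-in hash() for strings per process, so the values above change between runs unless PYTHONHASHSEED is fixed. A deterministic variant can seed from a cryptographic digest instead (a sketch; n_hash_stable and the choice of md5 are illustrative, not part of the original snippet):

import hashlib
import random

def n_hash_stable(s):
    # md5 is deterministic across processes, unlike the salted built-in hash()
    digest = hashlib.md5(str(s).encode('utf-8')).hexdigest()
    random.seed(int(digest, 16))
    return random.random()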

Extracting time-series features

df.activation_date = pd.to_datetime(df.activation_date)
# the vectorized .dt accessor is faster than apply for datetime components
df['day_of_month'] = df.activation_date.dt.day
df['day_of_week'] = df.activation_date.dt.weekday
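
Other components come straight off the same .dt accessor (a sketch; pick whichever features your model needs):

df['month'] = df.activation_date.dt.month                            # 1-12
df['is_weekend'] = (df.activation_date.dt.weekday >= 5).astype(int)  # Sat/Sun flag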

Profiling with pandas-profiling

import pandas_profiling as pp

# one-shot overview of dtypes, missing values, distributions, and correlations
pp.ProfileReport(train_X)
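
Depending on the pandas_profiling version, the report can also be saved as a standalone HTML file (a sketch; the filename is arbitrary):

pp.ProfileReport(train_X).to_file('train_profile.html')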

Computing RMSE

import math
from sklearn.metrics import mean_squared_error

pred = model.predict(train_X)
rmse = math.sqrt(mean_squared_error(train_Y, pred))
display(rmse)
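
Note that an RMSE measured on the training data is optimistic. A hold-out split gives a more honest estimate (a minimal sketch; the 0.2 split and random_state are arbitrary):

from sklearn.model_selection import train_test_split

tr_X, va_X, tr_Y, va_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)
model.fit(tr_X, tr_Y)
va_pred = model.predict(va_X)
print(math.sqrt(mean_squared_error(va_Y, va_pred)))  # RMSE on unseen rows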

Using XGBRegressor

import xgboost as xgb

model = xgb.XGBRegressor(n_estimators=400, learning_rate=0.05, gamma=0, subsample=0.75,
                         colsample_bytree=1, max_depth=7)
model.fit(train_X, train_Y)
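
If a hold-out set is available, the sklearn API of XGBoost can also stop training early (a sketch; val_X/val_Y are assumed to exist, and recent xgboost versions move early_stopping_rounds into the constructor):

model = xgb.XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=7)
model.fit(train_X, train_Y,
          eval_set=[(val_X, val_Y)],   # hypothetical hold-out set
          early_stopping_rounds=50,    # stop after 50 rounds without improvement
          verbose=False)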

Feature importance with GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt

gb = GradientBoostingRegressor()
gb.fit(train_X, train_Y)

importances = gb.feature_importances_
# spread of each feature's importance across the individual trees, for error bars
std = np.std([tree[0].feature_importances_ for tree in gb.estimators_], axis=0)
indices = np.argsort(importances)

plt.figure()
plt.title("Feature importances")
plt.barh(range(train_X.shape[1]), importances[indices],
         color="r", xerr=std[indices], align="center")
plt.yticks(range(train_X.shape[1]), train_X.columns[indices])  # column names, not bare indices
plt.ylim([-1, train_X.shape[1]])
plt.show()

print("Features importance...")
gain = gb.feature_importances_
ft = pd.DataFrame({'feature':train_X.columns.values, 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(ft.head(25))

# DataFrame.plot creates its own figure via figsize, so an extra plt.figure()
# would only leave an empty canvas behind
ft[['feature', 'gain']].head(25).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(10, 20))
plt.show()

Feature importance with LightGBM

import lightgbm

y = train_Y.to_numpy().ravel()  # as_matrix() was removed from recent pandas versions
y.shape  # quick sanity check on the label shape
train_data = lightgbm.Dataset(train_X, label=y)
# ideally this would be a hold-out split; the original snippet reuses the training data
val_data = lightgbm.Dataset(train_X, label=y)

parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 100
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=val_data,
                       num_boost_round=2000,
                       # LightGBM >= 4.0 expects callbacks=[lightgbm.early_stopping(100)] instead
                       early_stopping_rounds=100)
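
Predictions should then use the best iteration found by early stopping (a small sketch; test_X is a hypothetical frame with the same columns as train_X):

pred = model.predict(test_X, num_iteration=model.best_iteration)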

print("Features importance...")
gain = model.feature_importance('gain')
ft = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(ft.head(25))

ft[['feature', 'gain']].head(25).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(10, 20))
plt.show()
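
LightGBM also ships a built-in importance plot, which avoids building the DataFrame by hand (sketch):

lightgbm.plot_importance(model, importance_type='gain', max_num_features=25, figsize=(10, 20))
plt.show()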