Tabular Data Binary Classification: All Tips and Tricks from 5 Kaggle Competitions
Tabular Data Binary Classification: All Tips and Tricks from 5 Kaggle Competitions - neptune.ai
In this article, I will discuss some great tips and tricks to improve the performance of your structured data binary classification model. These tricks are obtained from solutions of some of Kaggle’s top tabular data competitions. Without much lag, let
neptune.ai
1. 큰 데이터 셋 다루기
여기서 첫째 항목은 별로인 것 같고,
두번째 항목은 중요해 보인다.
이런 코드를 예전에 본 적이 있었는데 출처가 캐글이었다.
# 출처 : 카일스쿨 / https://zzsza.github.io/kyle-school/week2/#/3/9
def reduce_mem_usage(df):
    """
    Downcast the columns of a dataframe in place to reduce memory usage.

    Columns whose dtype name starts with ``int`` are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Columns whose dtype name starts with ``float`` are handled the same way
    with float16/float32/float64. Object columns become ``category``. Every
    other dtype (category, datetime, bool, unsigned, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.

    Notes
    -----
    Only the value range is checked. float16 and float32 carry fewer
    significant digits than float64, so downcast float values may be rounded.
    """
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f}MB')

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        type_name = str(col_type)
        if type_name.startswith('int'):
            candidates, limits = int_types, np.iinfo
        elif type_name.startswith('float'):
            candidates, limits = float_types, np.finfo
        else:
            # min()/max() are only meaningful for the numeric dtypes handled
            # here; e.g. an unordered categorical column raises on min().
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: the limits themselves are representable.
            # NaN/inf extremes fail every comparison, so such a column keeps
            # its current dtype instead of being cast.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f}MB')
    # Guard the ratio: the starting footprint can be zero.
    decreased = 100 * ((start_mem - end_mem) / start_mem) if start_mem else 0.0
    print(f'Decreased by {decreased:.1f}%')
    return df
( 아래는 두번째 링크(캐글)의 데이터 형을 바꿔서 메모리를 절약하는 코드 )
# Representable (min, max) of each narrow NumPy dtype, used as downcast thresholds.
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max
FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max
def memory_usage(data, detail=1):
    """
    Print and return the total memory footprint of ``data`` in megabytes.

    Parameters
    ----------
    data : pandas.DataFrame
        Object providing ``memory_usage()``.
    detail : int or bool, default 1
        When truthy, the per-column usage is shown before the total.

    Returns
    -------
    float
        Sum of ``data.memory_usage()`` divided by 1024 * 1024.
    """
    per_column = data.memory_usage()
    if detail:
        # ``display`` is only injected by IPython/Jupyter; fall back to
        # ``print`` so the default call also works in a plain interpreter.
        try:
            show = display
        except NameError:
            show = print
        show(per_column)
    memory = per_column.sum() / (1024*1024)
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory
def compress_dataset(data):
    """
    Downcast float64 and int64 columns of ``data`` in place and print a report.

    float64 columns are cast to float16 or float32, and int64 columns to
    int8, int16 or int32, when the column's min and max lie strictly inside
    the target range. Columns of any other dtype are listed but not changed.
    The overall memory saving is printed after each converted column and once
    more at the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are converted in place.

    Returns
    -------
    None
    """
    length_interval = 50
    length_float_decimal = 4
    separator = '=' * length_interval

    memory_before_compress = memory_usage(data, 0)

    def report_rate():
        # Re-measure the frame and print the saving relative to the start.
        memory_after_compress = memory_usage(data, 0)
        if memory_before_compress:
            rate = (memory_before_compress - memory_after_compress) / memory_before_compress
        else:
            # Nothing was allocated to begin with; avoid dividing by zero.
            rate = 0.0
        print("Compress Rate: [{0:.2%}]".format(rate))

    print()
    print(separator)
    for col in data.columns:
        col_dtype = data[col].dtype
        if col_dtype == 'object':
            continue
        # str() keeps the width format working for non-string column labels.
        print("Name: {0:24s} Type: {1}".format(str(col), col_dtype))

        if col_dtype == 'float64':
            col_min = data[col].min()
            col_max = data[col].max()
            print(" variable min: {0:15s} max: {1:15s}".format(str(np.round(col_min, length_float_decimal)), str(np.round(col_max, length_float_decimal))))
            if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
                data[col] = data[col].astype(np.float16)
                print(" float16 min: {0:15s} max: {1:15s}".format(str(FLOAT16_MIN), str(FLOAT16_MAX)))
                print("compress float64 --> float16")
            elif (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                data[col] = data[col].astype(np.float32)
                print(" float32 min: {0:15s} max: {1:15s}".format(str(FLOAT32_MIN), str(FLOAT32_MAX)))
                print("compress float64 --> float32")
            report_rate()
            print(separator)
        elif col_dtype == 'int64':
            col_min = data[col].min()
            col_max = data[col].max()
            print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
            # Bit width actually chosen; stays 64 when no narrower type fits.
            type_flag = 64
            # NOTE(review): int8 is only chosen when the values fit in half of
            # the int8 range - presumably deliberate headroom; confirm.
            if (col_min > INT8_MIN/2) and (col_max < INT8_MAX/2):
                type_flag = 8
                data[col] = data[col].astype(np.int8)
                print(" int8 min: {0:15s} max: {1:15s}".format(str(INT8_MIN), str(INT8_MAX)))
            elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
                type_flag = 16
                data[col] = data[col].astype(np.int16)
                print(" int16 min: {0:15s} max: {1:15s}".format(str(INT16_MIN), str(INT16_MAX)))
            elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
                type_flag = 32
                data[col] = data[col].astype(np.int32)
                print(" int32 min: {0:15s} max: {1:15s}".format(str(INT32_MIN), str(INT32_MAX)))
            report_rate()
            # Report the width that was really applied.
            if type_flag == 64:
                print("no compression (int64 kept)")
            else:
                print("compress (int64) ==> (int{0})".format(type_flag))
            print(separator)

    print()
    report_rate()
이러한 코드를 봤을 때 가장 먼저 든 생각은 아래 글이었다.
2. EDA
- 캐글 사례들을 소개한다. 그런데 만약에 유사한 도메인의 문제를 푸는 게 아니라면 어떤 인사이트를 얻을 수 있는지 모르겠다.
- 차라리 sweetviz처럼 시각화를 돕는 라이브러리를 쓰는 게 낫지 않을까 싶다.
3.
소주제와 별개로 이 커널은 상당히 흥미롭다.
- oof 를 아주 지독하게 했다. 그리고 결과 EDA를 한다.
- https://www.kaggle.com/nawidsayed/lightgbm-and-cnn-3rd-place-solution/notebook
LightGBM and CNN 3rd place solution
Explore and run machine learning code with Kaggle Notebooks | Using data from multiple data sources
www.kaggle.com
GPU LGBM은 처음 보네. ( GPU XGBoost는 봤었다. 그런데 CPU 대비 속도가 그다지 빠르지 않다는 글을 봐서 GPU를 썼을 때 얼마나 빨라지는지 궁금하다. )
- https://www.kaggle.com/nicapotato/gpyopt-hyperparameter-optimisation-gpu-lgbm
Gpyopt Hyperparameter Optimisation - GPU LGBM
Explore and run machine learning code with Kaggle Notebooks | Using data from IEEE-CIS Fraud Detection
www.kaggle.com
하이퍼파라미터별 성능 비교가 인상적
...
gpyopt_output.append(
[
loss,
experiment_lgb.best_score['train'][metric],
score,
experiment_lgb.best_iteration,
params,
runtime
]
...
...
results = pd.DataFrame(gpyopt_output,
columns = ['logloss','train_auc','valid_auc',
'boosting_rounds','parameters', 'runtime']
)
...
# Grid of t_r x t_c axes, one regression plot per entry in ``paras``.
t_r, t_c = 3, 4
f, axes = plt.subplots(
    t_r, t_c,
    figsize=[15, 12],
    sharex=False,
    sharey=False,
)
paras = [
    'num_leaves', 'subsample_for_bin', 'min_sum_hessian_in_leaf', 'reg_alpha',
    'reg_lambda', 'bagging_fraction', 'feature_fraction', 'min_data_in_leaf',
    'boosting_rounds', 'runtime', 'logloss', 'train_auc',
]
row, col = 0, 0
for var in paras:
    # Wrap to the first column of the next row once the current row is full.
    if col == t_c:
        row, col = row + 1, 0
    ax = axes[row, col]

    # seaborn's regplot: valid_auc against the parameter, with np.mean as the
    # x estimator and a fit on log(x).
    sns.regplot(
        x=var,
        y="valid_auc",
        data=results,
        x_estimator=np.mean,
        logx=True,
        truncate=True,
        ax=ax,
    )
    ax.set_title('{} vs AUC'.format(var.title()))
    ax.grid(True, lw=2, ls='--', c='.75')
    # Every panel shares the observed validation-AUC range on its y axis.
    ax.set_ylim(results.valid_auc.min(), results.valid_auc.max())
    # The last plot would otherwise get a wacky x limit, so pin it.
    if var == paras[-1]:
        ax.set_xlim(.90, 1)
    col += 1

plt.tight_layout(pad=0)
plt.show()