# 728x90  (Tistory ad-banner placeholder left over from the blog scrape — not code)
# import basic library
import numpy as np
import pandas as pd
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
# Load the competition data: training set, test set, and the sample
# submission skeleton.
train = pd.read_csv("../tabular-playground-series-sep-2021/train.csv")
test = pd.read_csv("../tabular-playground-series-sep-2021/test.csv")
sample = pd.read_csv("../tabular-playground-series-sep-2021/sample_solution.csv")

# Separate predictors from the target; work on copies so the raw
# frames stay untouched by the feature engineering below.
X = train.drop(['id', 'claim'], axis=1).copy()
y = train['claim'].copy()
test_data = test.drop(['id'], axis=1).copy()
# Data preprocessing
def get_stats_per_row(data):
    """Append row-wise summary features to *data* (mutated in place).

    The scraped source lost its indentation; this restores a valid body.

    New columns (computed over the feature columns whose name starts
    with ``f``):
      max_row   : row-wise maximum
      n_missing : number of NaN values per row
      min_row   : row-wise minimum
      plus      : row-wise sum; NaN when any feature is NaN, matching
                  the original per-column accumulation loop

    Parameters
    ----------
    data : pandas.DataFrame whose feature columns are named ``f...``

    Returns
    -------
    pandas.DataFrame
        The same frame, with the four columns added.
    """
    features = [c for c in data.columns.values if c[0] == "f"]
    data['max_row'] = data[features].max(axis=1)
    data['n_missing'] = data[features].isna().sum(axis=1)
    data['min_row'] = data[features].min(axis=1)
    # Vectorized replacement for the original `for feature in features:`
    # accumulation loop; skipna=False reproduces its NaN propagation.
    data['plus'] = data[features].sum(axis=1, skipna=False)
    return data
# Engineer the row-statistic features on both the train and test frames.
X = get_stats_per_row(X)
test_data = get_stats_per_row(test_data)

# Preprocessing pipeline: median imputation followed by standardization.
# Fit on the training features only; reuse the fitted transform for test.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
)

X = pd.DataFrame(data=pipeline.fit_transform(X), columns=X.columns)
test_data = pd.DataFrame(data=pipeline.transform(test_data), columns=test_data.columns)
# Pre-tuned hyper-parameters for the CatBoostClassifier used below;
# training runs on GPU device 0 with per-iteration logging disabled.
best_params = dict(
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)
# %%time  -- Jupyter cell magic; invalid in a plain .py script, kept here as a comment
# 5-fold cross-validated CatBoost training: per-fold validation AUC plus
# per-fold probability predictions on the test set, averaged afterwards.
# (The scraped source lost the loop-body indentation; restored here.)
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

kf = KFold(n_splits=5, shuffle=True, random_state=1)
pred_tmp = []  # one array of test-set probabilities per fold
scores = []    # one validation AUC per fold

for fold, (idx_train, idx_valid) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = CatBoostClassifier(**best_params)
    model.fit(X_train, y_train)

    # Validation score for this fold. roc_auc_score (already imported at
    # the top of the file) replaces the manual roc_curve + auc pair.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, pred_valid)
    scores.append(score)
    print(f"Fold: {fold + 1} Score: {score}")
    print('::'*20)

    # Collect this fold's test predictions for the final ensemble.
    y_hat = model.predict_proba(test_data)[:, 1]
    pred_tmp.append(y_hat)

print(f"Overall Validation Score: {np.mean(scores)}")

# Average the per-fold test predictions into the final ensemble output.
predictions = np.mean(np.column_stack(pred_tmp), axis=1)
# create submission file
sample_solution['claim'] = predictions
sample_solution.to_csv('./catb_baseline.csv', index=False)
# 728x90  (Tistory ad-banner placeholder — not code)
# --- Blog footer (scraped related-posts table, category
# 'Portfolio & Project > Project in Competition') ---
# - [Dacon] 2nd Kosfo x Dacon Book Recommendation AI Competition (PDF)  | 2023.06.09
# - [Dacon] 2nd Kosfo x Dacon Book Recommendation AI Competition (code) | 2023.05.08
# - Tabular Playground Series - Sep 2021_EDA                            | 2021.09.14