Portfolio & Project/Project in Competition

Tabular Playground Series - Sep 2021_Modeling

맨사설 2021. 9. 15. 20:00
728x90

 

# import basic library
import numpy as np
import pandas as pd
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier

 

# Read the three competition CSV files: train.csv, test.csv and the
# sample_solution.csv submission template.
train = pd.read_csv("../tabular-playground-series-sep-2021/train.csv")
test = pd.read_csv("../tabular-playground-series-sep-2021/test.csv")
sample = pd.read_csv("../tabular-playground-series-sep-2021/sample_solution.csv")

# Work on copies so the raw `train` / `test` frames stay untouched.
# `y` is the 'claim' column; the feature frames drop 'id' (and, for the
# training data, the 'claim' target as well).
y = train['claim'].copy()
X = train.drop(['id', 'claim'], axis=1).copy()
test_data = test.drop('id', axis=1).copy()

 

# Data preprocessing
def get_stats_per_row(data):
    """Add row-wise summary statistics of the ``f*`` feature columns.

    Feature columns are those whose label is a string starting with ``"f"``.
    Four columns are appended to ``data``, in this order:

    * ``max_row``   -- row maximum over the feature columns
    * ``n_missing`` -- number of missing (NaN) feature values in the row
    * ``min_row``   -- row minimum over the feature columns
    * ``plus``      -- row sum over the feature columns; NaN as soon as any
      feature value in the row is NaN

    Args:
        data: DataFrame holding the feature columns. It is modified in place.

    Returns:
        The same ``data`` object, with the four columns added.
    """
    # Non-string labels (e.g. integer column names) and empty-string labels
    # are skipped rather than raising when the first character is inspected.
    features = [
        label for label in data.columns
        if isinstance(label, str) and label.startswith("f")
    ]
    # Snapshot of the feature columns, taken before any statistic is added,
    # so every statistic is computed over exactly the same set of columns.
    feature_frame = data[features]

    data['max_row'] = feature_frame.max(axis=1)

    data['n_missing'] = feature_frame.isna().sum(axis=1)

    data['min_row'] = feature_frame.min(axis=1)

    # skipna=False keeps NaN propagation: a row with any missing feature gets
    # a NaN sum, exactly as when the columns are added together one by one.
    data['plus'] = feature_frame.sum(axis=1, skipna=False)

    return data

# Append the row-level statistics to both feature frames.
X = get_stats_per_row(X)
test_data = get_stats_per_row(test_data)

# Preprocessing pipeline: median imputation followed by standard scaling.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
)

# The pipeline is fitted on the training features and then applied, without
# refitting, to the test features. The transformed arrays are wrapped back
# into DataFrames that carry the original column labels.
X = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns)
test_data = pd.DataFrame(pipeline.transform(test_data), columns=test_data.columns)
# Best hyper-parameters; passed as keyword arguments to CatBoostClassifier.
# Grouping below is for readability only (see the CatBoost parameter docs
# for the exact meaning of each entry).
best_params = {
    # loss and boosting schedule
    'objective': 'CrossEntropy',
    'iterations': 15585,
    'learning_rate': 0.023575206684596582,
    'od_wait': 1144,
    'leaf_estimation_iterations': 1,
    # tree shape
    'depth': 7,
    'min_data_in_leaf': 11,
    # regularisation, randomness and row sampling
    'reg_lambda': 36.30433203563295,
    'random_strength': 43.75597655616195,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.8227911142845009,
    # hardware selection and logging
    'task_type': 'GPU',
    'devices': '0',
    'verbose': 0,
}
# 5-fold cross-validation: fit one CatBoost classifier per fold, score it on
# the held-out fold with ROC AUC, and collect its test-set probabilities.
#
# Wall time is measured with time.perf_counter() rather than the IPython
# ``%%time`` cell magic: the magic is only valid as the first line of a
# notebook cell and is a syntax error in a plain Python file.
import time

from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from catboost import CatBoostClassifier

cv_start = time.perf_counter()

kf = KFold(n_splits=5, shuffle=True, random_state=1)

pred_tmp = []  # one array of test-set probabilities per fold
scores = []    # one validation AUC per fold

for fold, (idx_train, idx_valid) in enumerate(kf.split(X)):
    # kf.split yields positional indices, hence .iloc on both X and y.
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    # A fresh model is built for every fold from the same parameter set.
    model = CatBoostClassifier(**best_params)
    model.fit(X_train, y_train)

    # validation prediction: column 1 of predict_proba is used as the score
    pred_valid = model.predict_proba(X_valid)[:,1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)
    
    print(f"Fold: {fold + 1} Score: {score}")
    print('::'*20)
    
    # test prediction with this fold's model; averaged over folds later
    y_hat = model.predict_proba(test_data)[:,1]
    pred_tmp.append(y_hat)
    
print(f"Overall Validation Score: {np.mean(scores)}")
print(f"Wall time: {time.perf_counter() - cv_start:.1f} s")
# Average the predictions over all folds: the per-fold vectors are stacked as
# columns (one row per test sample) and the row mean is the final prediction.
predictions = np.mean(np.column_stack(pred_tmp),axis=1)

# Create the submission file. The template loaded from sample_solution.csv is
# bound to the name `sample`; no `sample_solution` variable exists, so using
# that name here would raise NameError.
sample['claim'] = predictions
sample.to_csv('./catb_baseline.csv', index=False)

 

728x90