728x90
In [1]:
# 기본 라이브러리 import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics
from sklearn.metrics import mean_squared_error
In [2]:
# 데이터 불러오기 (train, test, sample_submission)
train= pd.read_csv('C:/Users/20229069/Desktop/superdata/train.csv', encoding='UTF-8')
test= pd.read_csv('C:/Users/20229069/Desktop/superdata/test.csv', encoding='UTF-8')
sample= pd.read_csv('C:/Users/20229069/Desktop/superdata/sample_submission.csv', encoding='UTF-8')
In [3]:
train.head()
Out[3]:
ID | User-ID | Book-ID | Book-Rating | Age | Location | Book-Title | Book-Author | Year-Of-Publication | Publisher | |
---|---|---|---|---|---|---|---|---|---|---|
0 | TRAIN_000000 | USER_00000 | BOOK_044368 | 8 | 23.0 | sackville, new brunswick, canada | Road Taken | Rona Jaffe | 2001.0 | Mira |
1 | TRAIN_000001 | USER_00000 | BOOK_081205 | 8 | 23.0 | sackville, new brunswick, canada | Macbeth (New Penguin Shakespeare) | William Shakespeare | 1981.0 | Penguin Books |
2 | TRAIN_000002 | USER_00000 | BOOK_086781 | 0 | 23.0 | sackville, new brunswick, canada | Waverley (Penguin English Library) | Walter Scott | 1981.0 | Penguin Books |
3 | TRAIN_000003 | USER_00000 | BOOK_098622 | 0 | 23.0 | sackville, new brunswick, canada | Mother Earth Father Sky | Sue Harrison | 1991.0 | Avon |
4 | TRAIN_000004 | USER_00000 | BOOK_180810 | 8 | 23.0 | sackville, new brunswick, canada | She Who Remembers | Linda Lay Shuler | 1989.0 | Signet Book |
In [4]:
test.head()
Out[4]:
ID | User-ID | Book-ID | Age | Location | Book-Title | Book-Author | Year-Of-Publication | Publisher | |
---|---|---|---|---|---|---|---|---|---|
0 | TEST_000000 | USER_00008 | BOOK_047966 | 37.0 | vermilion, ohio, usa | Birds of Prey: A Novel of Suspense | J.A. Jance | 2002.0 | Avon |
1 | TEST_000001 | USER_00008 | BOOK_119494 | 37.0 | vermilion, ohio, usa | Midnight Voices | JOHN SAUL | 2003.0 | Ballantine Books |
2 | TEST_000002 | USER_00008 | BOOK_151775 | 37.0 | vermilion, ohio, usa | Breaking Free : A Prescription for Personal an... | David M. Noer | 1996.0 | Jossey-Bass |
3 | TEST_000003 | USER_00008 | BOOK_176255 | 37.0 | vermilion, ohio, usa | Bitter Harvest | Ann Rule | 1999.0 | |
4 | TEST_000004 | USER_00008 | BOOK_187307 | 37.0 | vermilion, ohio, usa | Embraced by the Light | Betty J. Eadie | 1994.0 | Bantam Books |
In [5]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871393 entries, 0 to 871392
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 871393 non-null object
1 User-ID 871393 non-null object
2 Book-ID 871393 non-null object
3 Book-Rating 871393 non-null int64
4 Age 871393 non-null float64
5 Location 871393 non-null object
6 Book-Title 871393 non-null object
7 Book-Author 871393 non-null object
8 Year-Of-Publication 871393 non-null float64
9 Publisher 871393 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 66.5+ MB
EDA¶
In [6]:
train['Book-Rating'].plot.hist(bins=range(0,10,1),color='y', edgecolor='k', title = 'Book-Rating distribution(Train)')
Out[6]:
<Axes: title={'center': 'Book-Rating distribution(Train)'}, ylabel='Frequency'>
In [7]:
np.log(train['Book-Rating']).plot.hist(bins=range(0,4,1),color='y', edgecolor='k', title = 'Book-Rating distribution(Train)')
C:\Users\20229069\AppData\Local\anaconda3\lib\site-packages\pandas\core\arraylike.py:402: RuntimeWarning: divide by zero encountered in log
result = getattr(ufunc, method)(*inputs, **kwargs)
Out[7]:
<Axes: title={'center': 'Book-Rating distribution(Train)'}, ylabel='Frequency'>
In [8]:
plt.subplot(2, 2, 1)
train['Age'].plot.hist(bins=range(0,100,10),color='c', edgecolor='k', title = 'Age distribution(Train)')
plt.subplot(2, 2, 2)
test['Age'].plot.hist(bins=range(0,100,10),color='c', edgecolor='k', title = 'Age distribution(Test)')
plt.subplot(2, 2, 3)
train['Year-Of-Publication'].plot.hist(bins=range(1950,2020,10), color='orange', edgecolor='k', title = 'Year of publication distribution(Train)')
plt.subplot(2, 2, 4)
test['Year-Of-Publication'].plot.hist(bins=range(1950,2020,10),color='orange', edgecolor='k', title = 'Year of publication distribution(Test)')
plt.tight_layout(
h_pad=3, # 세로 간격 조절
w_pad=4) # 가로 간격 조걸
In [9]:
plt.subplot(2, 2, 1)
et0= train.groupby('User-ID').count()
et0['ID'].plot.hist(bins=range(1,20,1),color='pink', edgecolor='k',title = 'Number of User-ID(Train)')
plt.subplot(2, 2, 2)
et1= test.groupby('User-ID').count()
et1['ID'].plot.hist(bins=range(1,20,1),color='pink', edgecolor='k',title = 'Number of User-ID(Test)')
plt.subplot(2, 2, 3)
et0= train.groupby('Book-ID').count()
et0['ID'].plot.hist(bins=range(1,20,1),color='burlywood', edgecolor='k',title = 'Number of Book-ID(Train)')
plt.subplot(2, 2, 4)
et1= test.groupby('Book-ID').count()
et1['ID'].plot.hist(bins=range(1,20,1),color='burlywood', edgecolor='k',title = 'Number of Book-ID(Test)')
plt.tight_layout(
h_pad=3, # 세로 간격 조절
w_pad=4) # 가로 간격 조걸
In [10]:
# 작가별 count
et = test.groupby('Book-Author').count()
et['name']= et.index
et = et[['name','ID']]
et = et.drop_duplicates(['name'])
# Top 10 추출
et_top10 = et.sort_values(by="ID", ascending=False).head(10)
plt.subplot(2, 1, 1)
sns.barplot(data=et_top10, y="name", x="ID", orient='h', edgecolor='k',color='dodgerblue')
plt.title('Top 10 Author(Train)')
# Book-Title별 count
et = test.groupby('Book-Title').count()
et['name']= et.index
et = et[['name','ID']]
et = et.drop_duplicates(['name'])
# Top 10 추출
et_top10 = et.sort_values(by="ID", ascending=False).head(10)
plt.subplot(2, 1, 2)
sns.barplot(data=et_top10, y="name", x="ID", orient='h', edgecolor='k',color='dodgerblue')
plt.title('Top 10 Book-title(Train)')
plt.tight_layout(
h_pad=2, # 세로 간격 조절
w_pad=0) # 가로 간격 조걸
분석¶
In [11]:
# 나이 구분 컬럼 추가
def func(x) :
if x < 10:
return "00대"
elif x < 20 :
return "10대"
elif x < 30 :
return "20대"
elif x < 40 :
return "30대"
elif x < 50 :
return "40대"
elif x < 60 :
return "50대"
elif x < 70 :
return "60대"
elif x < 80 :
return "70대"
elif x < 90 :
return "80대"
elif x < 100 :
return "90대"
elif x >= 100 :
return "100대"
train["age_gubn"] = train["Age"].apply(lambda x : func(x))
test["age_gubn"] = test["Age"].apply(lambda x : func(x))
In [12]:
# location 컬럼에서 첫번째 변수만 분리 사용 (train & test)
data_list = train['Location'].str.split(',')
train['town'] = data_list.str.get(0)
data_list = test['Location'].str.split(',')
test['town'] = data_list.str.get(0)
In [13]:
# 출판년도컬럼 추가
def func2(x) :
if x == -1:
return "unknown"
elif x < 1960 :
return "1950s"
elif x < 1970 :
return "1960s"
elif x < 1980 :
return "1970s"
elif x < 1990 :
return "1980s"
elif x < 2000 :
return "1990s"
elif x >= 2000 :
return "2000s"
train["Publication_Year_gubn"] = train["Year-Of-Publication"].apply(lambda x : func2(x))
test["Publication_Year_gubn"] = test["Year-Of-Publication"].apply(lambda x : func2(x))
In [14]:
train.head()
Out[14]:
ID | User-ID | Book-ID | Book-Rating | Age | Location | Book-Title | Book-Author | Year-Of-Publication | Publisher | age_gubn | town | Publication_Year_gubn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TRAIN_000000 | USER_00000 | BOOK_044368 | 8 | 23.0 | sackville, new brunswick, canada | Road Taken | Rona Jaffe | 2001.0 | Mira | 20대 | sackville | 2000s |
1 | TRAIN_000001 | USER_00000 | BOOK_081205 | 8 | 23.0 | sackville, new brunswick, canada | Macbeth (New Penguin Shakespeare) | William Shakespeare | 1981.0 | Penguin Books | 20대 | sackville | 1980s |
2 | TRAIN_000002 | USER_00000 | BOOK_086781 | 0 | 23.0 | sackville, new brunswick, canada | Waverley (Penguin English Library) | Walter Scott | 1981.0 | Penguin Books | 20대 | sackville | 1980s |
3 | TRAIN_000003 | USER_00000 | BOOK_098622 | 0 | 23.0 | sackville, new brunswick, canada | Mother Earth Father Sky | Sue Harrison | 1991.0 | Avon | 20대 | sackville | 1990s |
4 | TRAIN_000004 | USER_00000 | BOOK_180810 | 8 | 23.0 | sackville, new brunswick, canada | She Who Remembers | Linda Lay Shuler | 1989.0 | Signet Book | 20대 | sackville | 1980s |
In [15]:
# 각 변수별 평균 평점 추출
train['ID_avg'] = train.groupby(['User-ID']).transform(np.mean)['Book-Rating']
train['book_avg'] = train.groupby(['Book-ID']).transform(np.mean)['Book-Rating']
train['book_author_avg'] = train.groupby(['Book-Author']).transform(np.mean)['Book-Rating']
train['book_publisher_avg'] = train.groupby(['Publisher']).transform(np.mean)['Book-Rating']
train['pyg_avg'] = train.groupby(['Publication_Year_gubn']).transform(np.mean)['Book-Rating']
train['age_gubn_avg'] = train.groupby(['age_gubn']).transform(np.mean)['Book-Rating']
train['town_avg'] = train.groupby(['town']).transform(np.mean)['Book-Rating']
train['book_author_q5'] = train.groupby(['Book-Author'])['Book-Rating'].transform(lambda x: x.quantile(q=0.5))
train['book_publisher_q5'] = train.groupby(['Publisher'])['Book-Rating'].transform(lambda x: x.quantile(q=0.5))
train['pyg_q5'] = train.groupby(['Publication_Year_gubn'])['Book-Rating'].transform(lambda x: x.quantile(q=0.5))
train['age_gubn_q5'] = train.groupby(['age_gubn'])['Book-Rating'].transform(lambda x: x.quantile(q=0.5))
train['town_q5'] = train.groupby(['town'])['Book-Rating'].transform(lambda x: x.quantile(q=0.5))
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['ID_avg'] = train.groupby(['User-ID']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['book_avg'] = train.groupby(['Book-ID']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:4: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['book_author_avg'] = train.groupby(['Book-Author']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:5: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['book_publisher_avg'] = train.groupby(['Publisher']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:6: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['pyg_avg'] = train.groupby(['Publication_Year_gubn']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:7: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['age_gubn_avg'] = train.groupby(['age_gubn']).transform(np.mean)['Book-Rating']
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\3521221108.py:8: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
train['town_avg'] = train.groupby(['town']).transform(np.mean)['Book-Rating']
In [16]:
# Book-rating 데이터
X1 = train[['ID_avg','book_avg']]
y1 = train[['Book-Rating']]
# User-ID 데이터
x11 = train[['age_gubn_avg','town_avg',
'age_gubn_q5','town_q5']]
y11 = train[['ID_avg']]
# Book-ID 데이터
x12 = train[['book_author_avg','book_publisher_avg','pyg_avg',
'book_author_q5','book_publisher_q5','pyg_q5']]
y12 = train[['book_avg']]
In [17]:
# Book-rating 예측 모델
X1 = train[['ID_avg','book_avg']]
y1 = train[['Book-Rating']]
# 성능평가지표(MSE)
from sklearn.metrics import mean_squared_error
# 데이터 분리
from sklearn.model_selection import train_test_split
x_train0, x_test0, y_train0, y_test0 = train_test_split(X1, y1, test_size=0.3, random_state=777)
from sklearn.linear_model import LinearRegression
rg=LinearRegression()
model = rg.fit(x_train0,y_train0)
y_rg=model.predict(x_test0)
score_rg=np.sqrt(mean_squared_error(y_test0, y_rg.reshape(-1,1)))
score_rg
Out[17]:
2.7483394595219135
In [18]:
# User-ID별 Book-rating 예측 모델
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x11, y11, test_size=0.3, random_state=777)
rg1=LinearRegression()
model1 = rg1.fit(x_train,y_train)
y_rg=model1.predict(x_test)
score_rg=np.sqrt(mean_squared_error(y_test, y_rg.reshape(-1,1)))
score_rg
Out[18]:
1.7261307359102553
In [19]:
# Book-ID별 Book-rating 예측 모델
from sklearn.model_selection import train_test_split
x1_train, x1_test, y1_train, y1_test = train_test_split(x12, y12, test_size=0.3, random_state=777)
rg2=LinearRegression()
model2 = rg2.fit(x1_train,y1_train)
y_rg=model2.predict(x1_test)
score_rg=np.sqrt(mean_squared_error(y1_test, y_rg.reshape(-1,1)))
score_rg
Out[19]:
1.5332398401527603
In [20]:
# Test 데이터에 Merge 작업 쿼리
ID = train[['User-ID','ID_avg']]
ID = ID.drop_duplicates(['User-ID'])
book = train[['Book-ID','book_avg']]
book = book.drop_duplicates(['Book-ID'])
book_author = train[['Book-Author','book_author_avg','book_author_q5']]
book_author = book_author.drop_duplicates(['Book-Author'])
Publisher = train[['Publisher','book_publisher_avg','book_publisher_q5']]
Publisher = Publisher.drop_duplicates(['Publisher'])
pyg = train[['Publication_Year_gubn','pyg_avg','pyg_q5']]
pyg = pyg.drop_duplicates(['Publication_Year_gubn'])
Age = train[['age_gubn','age_gubn_avg','age_gubn_q5']]
Age = Age.drop_duplicates(['age_gubn'])
town = train[['town','town_avg','town_q5']]
town = town.drop_duplicates(['town'])
test = pd.merge(test, ID, on = 'User-ID', how = 'left')
test = pd.merge(test, book, on = 'Book-ID', how = 'left')
test = pd.merge(test, book_author, on = 'Book-Author', how = 'left')
test = pd.merge(test, Publisher, on = 'Publisher', how = 'left')
test = pd.merge(test, pyg, on = 'Publication_Year_gubn', how = 'left')
test = pd.merge(test, Age, on = 'age_gubn', how = 'left')
test = pd.merge(test, town, on = 'town', how = 'left')
In [21]:
# User-ID별 Book-rating 예측 모델 사용을 위한 데이터 구축
x21 = test[['age_gubn_avg','town_avg',
'age_gubn_q5','town_q5']]
# Book-ID별 Book-rating 예측 모델 사용을 위한 데이터 구축
x22 = test[['book_author_avg','book_publisher_avg','pyg_avg',
'book_author_q5','book_publisher_q5','pyg_q5']]
# 평균으로 널값 처리
x21= x21.fillna(x21.mean())
x22= x22.fillna(x22.mean())
# User-ID별 Book-rating 예측 모델 사용
y_xgb=model1.predict(x21)
# Book-ID별 Book-rating 예측 모델 사용
y_xgb2=model2.predict(x22)
test['ID_other'] = y_xgb.reshape(-1,1)
test['Book_other'] = y_xgb2.reshape(-1,1)
In [22]:
# ID_avg와 book_avg null값시 예측 값으로 대체
test['ID_avg'] = np.where(pd.notnull(test['ID_avg']) == False,
test['ID_other'], test['ID_avg'])
test['book_avg'] = np.where(pd.notnull(test['book_avg']) == False,
test['Book_other'], test['book_avg'])
In [23]:
# Test 데이터 예측
X2 = test[['ID_avg','book_avg']]
y_rg=model.predict(X2)
y_rg
Out[23]:
array([[5.75362657],
[6.95246026],
[6.17161293],
...,
[3.29184657],
[2.31018802],
[1.26120854]])
In [24]:
ddf = pd.DataFrame(y_rg)
sample = pd.concat([sample, ddf], axis=1)
In [25]:
sample = sample.drop(['Book-Rating'] , axis=1)
sample.columns = ["ID", "Book-Rating"]
In [26]:
# 값 처리
sample['Book-Rating'] = np.where(sample['Book-Rating'] < 0, 0, sample['Book-Rating'])
sample['Book-Rating'] = np.where(sample['Book-Rating'] > 10, 10, sample['Book-Rating'])
In [27]:
# 파일 추출
sample.to_csv('C:/Users/20229069/Desktop/sample.csv', index=False)
In [30]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:80% !important;}</style>"))
C:\Users\20229069\AppData\Local\Temp\ipykernel_17752\529043151.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
from IPython.core.display import display, HTML
728x90
'Portfolio & Project > Project in Competition' 카테고리의 다른 글
[데이콘] 제2회 코스포 x 데이콘 도서 추천 알고리즘 AI경진대회(PDF) (0) | 2023.06.09 |
---|---|
Tabular Playground Series - Sep 2021_Modeling (0) | 2021.09.15 |
Tabular Playground Series - Sep 2021_EDA (0) | 2021.09.14 |