# 공통 모듈 임포트
import numpy as np
import os

# Load the MNIST dataset from OpenML.
from sklearn.datasets import fetch_openml

# as_frame=True: later cells use pandas APIs on the result (X.loc, y.value_counts()).
# parser='liac-arff': this is the default the notebook was run with; naming it
# explicitly avoids the FutureWarning about the default changing to 'auto' and
# keeps the returned dtypes stable across scikit-learn upgrades.
mnist = fetch_openml('mnist_784', version=1, as_frame=True, parser='liac-arff')

C:\Users\20229069\AppData\Local\anaconda3\Lib\site-packages\sklearn\datasets\_openml.py:1002: FutureWarning: The default value of `parser` will change from `'liac-arff'` to `'auto'` in 1.4. You can set `parser='auto'` to silence this warning. Therefore, an `ImportError` will be raised from 1.4 if the dataset is dense and pandas is not installed. Note that the pandas parser may return different data types. See the Notes Section in fetch_openml's API doc for details.
  warn(


# Pull the feature table and the label column out of the fetched dataset.
X, y = mnist["data"], mnist["target"]
X.shape  # notebook echo of the feature table's shape

(70000, 784)


y.shape  # notebook echo of the label column's shape

(70000,)


# 0행의 그림
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X.loc[[0]]
some_digit = some_digit.to_numpy()
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")

plt.show()


# Label of row 0 (still a string at this point, before the uint8 cast below)
y[0]

'5'


# Convert the labels to unsigned 8-bit integers so they can be compared to ints
y = y.astype(np.uint8)

# Training set = first 60,000 rows; test set = everything after that
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]


# Check that all 10 digits are present in the training labels
y_train.value_counts()

1    6742
7    6265
3    6131
2    5958
9    5949
0    5923
6    5918
8    5851
4    5842
5    5421
Name: class, dtype: int64


# Boolean targets for the binary task: True where the label equals 5
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)


# Train an SGDClassifier on the binary (5 / not-5) targets
from sklearn.linear_model import SGDClassifier

# random_state is fixed so repeated runs give the same result
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(random_state=42)

SGDClassifier(random_state=42)


# Measure accuracy with 3-fold cross-validation on the training set
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

array([0.95035, 0.96035, 0.9604 ])


from sklearn.model_selection import cross_val_predict

# Cross-validated (cv=3) predictions for the training set, used by the metrics below
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)


from sklearn.metrics import confusion_matrix

# Confusion matrix of the true binary labels vs. the cross-validated predictions
confusion_matrix(y_train_5, y_train_pred)

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)


# Summary metrics computed from the same labels/predictions as the confusion matrix
from sklearn.metrics import precision_score, recall_score

print("정밀도 : {:.2f}" .format(precision_score(y_train_5, y_train_pred))) # compute precision

print("재현율 : {:.2f}" .format(recall_score(y_train_5, y_train_pred))) # compute recall

from sklearn.metrics import f1_score

print("F1 값 : {:.2f}" .format(f1_score(y_train_5, y_train_pred))) # compute the F1 score

정밀도 : 0.84
재현율 : 0.65
F1 값 : 0.73


# Get the decision score (rather than the predicted class) for every training instance
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")

from sklearn.metrics import precision_recall_curve

# Precision and recall at each candidate decision threshold
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Draws both curves through pyplot, then adds a legend, an x-axis label,
    a grid, and fixed axis limits of [-50000, 50000] x [0, 1].

    Args:
        precisions: Precision values; the last element is not plotted.
        recalls: Recall values; the last element is not plotted.
        thresholds: Threshold values used as the x-coordinates.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, curve_label in curves:
        # Drop the final element so the curve lines up with `thresholds`
        # (presumably one element shorter, as precision_recall_curve returns).
        plt.plot(thresholds, values[:-1], line_format, label=curve_label, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    axis_limits = [-50000, 50000, 0, 1]
    plt.axis(axis_limits)

# Index of the first curve point whose precision is at least 90%.
# It is computed once and shared, so the recall and the threshold below are
# guaranteed to describe the same point (and the scan over `precisions` is
# not repeated).
idx_90_precision = np.argmax(precisions >= 0.90)
recall_90_precision = recalls[idx_90_precision]
threshold_90_precision = thresholds[idx_90_precision]

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# Red dotted guide lines: vertical at the chosen threshold, horizontal at
# precision 0.9 and at the matching recall (starting from the left axis limit).
plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:")
plt.plot([-50000, threshold_90_precision], [0.9, 0.9], "r:")
plt.plot([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")
# Red dots marking the precision and recall values at that threshold.
plt.plot([threshold_90_precision], [0.9], "ro")
plt.plot([threshold_90_precision], [recall_90_precision], "ro")
plt.show()


# Precision/recall trade-off plot
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y-axis) against recall (x-axis) through pyplot.

    Both axes are labeled and fixed to the range [0, 1], and a grid is drawn.

    Args:
        precisions: Values for the y-axis.
        recalls: Values for the x-axis.
    """
    plt.plot(recalls, precisions, "b-", linewidth=2)
    for set_axis_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
        set_axis_label(text, fontsize=16)
    axis_limits = [0, 1, 0, 1]
    plt.axis(axis_limits)
    plt.grid(True)

# Draw the precision-vs-recall curve and mark the point used for 90% precision
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
# Red dotted guides from the axes to (recall_90_precision, 0.9), plus a dot at that point
plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], "r:")
plt.plot([0.0, recall_90_precision], [0.9, 0.9], "r:")
plt.plot([recall_90_precision], [0.9], "ro")
plt.show()


from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier
# Cross-validated class probabilities from a random forest on the same binary task
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")

y_scores_forest = y_probas_forest[:, 1] # score = probability of the positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)


def plot_roc_curve(fpr, tpr, label=None):
    """Plot an ROC curve through pyplot, together with a diagonal reference line.

    Both axes are labeled and fixed to the range [0, 1], and a grid is drawn.

    Args:
        fpr: x-coordinates of the curve (false positive rates).
        tpr: y-coordinates of the curve (true positive rates).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    axis_limits = [0, 1, 0, 1]
    plt.axis(axis_limits)
    axis_labels = (
        (plt.xlabel, 'False Positive Rate (Fall-Out)'),
        (plt.ylabel, 'True Positive Rate (Recall)'),
    )
    for set_axis_label, text in axis_labels:
        set_axis_label(text, fontsize=16)
    plt.grid(True)


# Compare the ROC curves of the SGD classifier and the random forest
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
plt.figure(figsize=(8, 6))
# SGD curve is drawn directly as a blue dotted line; the forest curve goes through the helper
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()


# Widen the notebook container to 80% of the page.
# Import from IPython.display: importing `display` from IPython.core.display
# has been deprecated since IPython 7.14 and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container {width:80% !important;}</style>"))

C:\Users\20229069\AppData\Local\Temp\ipykernel_18208\529043151.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
  from IPython.core.display import display, HTML

[핸즈온 머신러닝] Chapter 6. 결정 트리 (0)	2024.04.01
[핸즈온 머신러닝] Chapter 5. 서포트 벡터 머신 (0)	2023.09.12
[핸즈온 머신러닝] Chapter 4. 모델 훈련 (0)	2023.09.06
[핸즈온 머신러닝] Chapter 2. 머신러닝 프로젝트 처음부터 끝까지 (0)	2023.08.04
[핸즈온 머신러닝] Chapter 1. 한눈에 보는 머신러닝 (0)	2023.07.26

#wannabeeeeeee the best DataScientist

#wannabeeeeeee the best DataScientist

[핸즈온 머신러닝] Chapter 3. 분류 본문

[핸즈온 머신러닝] Chapter 3. 분류

3장 분류

3.1 MNIST

3.2 이진 분류기 훈련

3.3 성능 측정

3.3.5 ROC 곡선

3.4 다중 분류

3.5 에러 분석

3.6 다중 레이블 분류

3.7 다중 출력 분류

'Book report > 핸즈온 머신러닝' 카테고리의 다른 글

티스토리툴바

« 2025/05 »
일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31