본문 바로가기
Learning-driven Methodology/ML (Machine Learning)

[Machine Learning] 의사 결정 트리 : 붓꽃 (Iris)

by goatlab 2022. 11. 29.
728x90
반응형
SMALL

데이터 로드

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Load the Iris dataset that ships with scikit-learn.
iris_dt = load_iris()

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    iris_dt.data,
    iris_dt.target,
    test_size=0.2,
    random_state=0,
)

# Show the shape of every split, one per line.
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape, sep="\n")

 

모델 학습

 

# Fit a decision tree with default growth settings on the training split
# (seeded so the result is reproducible).
tree_clf = DecisionTreeClassifier(random_state=0).fit(train_x, train_y)

# Accuracy on the held-out split. As a bare expression the value is only
# displayed when run in an interactive session / notebook cell.
tree_clf.score(test_x, test_y)

 

트리 그리기

 

fn = iris_dt.feature_names  # name of each feature
# load_iris exposes target_names as a NumPy array (per the scikit-learn docs);
# some scikit-learn releases validate class_names strictly and reject an
# ndarray, so hand plot_tree a plain list.
cn = list(iris_dt.target_names)  # name of each flower species

# Small, high-resolution figure that will hold the tree diagram.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(
    tree_clf,
    feature_names=fn,
    class_names=cn,
    filled=True,
    ax=axes,  # draw on the axes created above instead of the implicit current axes
)

fig.savefig("tree.png")

 

중요도 확인

 

# Pair each feature name with the importance the fitted tree assigned to it.
se0 = pd.Series(data=tree_clf.feature_importances_, index=fn)
se0  # bare expression: echoes the Series in an interactive session
sepal length (cm)    0.000000
sepal width (cm)     0.012534
petal length (cm)    0.064446
petal width (cm)     0.923020
dtype: float64
# Horizontal bar chart of the feature importances.
se0.plot.barh()

 

오버피팅 확인

 

def vilsualize_boudary(model, X, y):
    """Scatter 2-D samples and shade the decision regions of a classifier.

    A clone of ``model`` (same hyper-parameters, unfitted) is trained on
    ``X``/``y`` and used for the shading. The estimator passed in is NOT
    refitted, so a model previously trained on a different set of columns
    keeps its fitted state and can still be plotted or scored afterwards
    with its original feature names.

    Args:
        model: scikit-learn estimator; it is cloned, then the clone's
            ``fit`` and ``predict`` are used.
        X: array indexed as ``X[:, 0]`` (x axis) and ``X[:, 1]`` (y axis).
        y: class label per row of ``X``. The contour levels are placed at
            ``k - 0.5`` for ``k = 0..n_classes``, which assumes the labels
            are the consecutive integers ``0..n_classes - 1``.

    Returns:
        None. A new matplotlib figure is created and drawn on.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from sklearn.base import clone

    fig, ax = plt.subplots()

    # Training points, coloured by class.
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25, cmap='rainbow', edgecolor='k')
    ax.axis('tight')
    ax.axis('off')

    # Remember the extent of the scatter so the grid covers exactly that area.
    xlim_start, xlim_end = ax.get_xlim()
    ylim_start, ylim_end = ax.get_ylim()

    # Train a private copy so the caller's estimator is left untouched, then
    # predict a class for every node of a 200 x 200 grid over the plot area.
    boundary_model = clone(model).fit(X, y)
    xx, yy = np.meshgrid(np.linspace(xlim_start, xlim_end, num=200),
                         np.linspace(ylim_start, ylim_end, num=200))
    Z = boundary_model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Paint the predicted class of each grid cell behind the points.
    n_classes = len(np.unique(y))  # number of distinct target values
    ax.contourf(xx, yy, Z, alpha=0.3,
                levels=np.arange(n_classes + 1) - 0.5,
                cmap='rainbow')

# Decision regions for tree_clf, using only feature columns 2 onward.
vilsualize_boudary(tree_clf, train_x[:, 2:], train_y)

# Limit tree growth by requiring at least 6 training samples in every leaf,
# fit that model, and draw the same view for it.
new_tree_clf = DecisionTreeClassifier(min_samples_leaf=6, random_state=0).fit(train_x, train_y)
vilsualize_boudary(new_tree_clf, train_x[:, 2:], train_y)

fn = iris_dt.feature_names  # name of each feature
# load_iris exposes target_names as a NumPy array (per the scikit-learn docs);
# some scikit-learn releases validate class_names strictly and reject an
# ndarray, so hand plot_tree a plain list.
cn = list(iris_dt.target_names)  # name of each flower species

# Small, high-resolution figure that will hold the tree diagram.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(
    new_tree_clf,
    feature_names=fn,
    class_names=cn,
    filled=True,
    ax=axes,  # draw on the axes created above instead of the implicit current axes
)

fig.savefig("new_tree_clf.png")

# The label lists do not depend on the loop variable, so build them once.
fn = iris_dt.feature_names  # name of each feature
# load_iris exposes target_names as a NumPy array (per the scikit-learn docs);
# some scikit-learn releases validate class_names strictly and reject an
# ndarray, so hand plot_tree a plain list.
cn = list(iris_dt.target_names)  # name of each flower species

# Compare the trees grown with min_samples_leaf = 1, 11 and 21.
for min_samples in range(1, 31, 10):
    test_tree_clf = DecisionTreeClassifier(random_state=0, min_samples_leaf=min_samples)
    test_tree_clf.fit(train_x, train_y)

    # One figure per setting, titled with the value being tried.
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
    tree.plot_tree(
        test_tree_clf,
        feature_names=fn,
        class_names=cn,
        filled=True,
        ax=axes,  # draw on this iteration's axes explicitly
    )
    axes.set_title(f"min samples: {min_samples}")
    plt.show()

728x90
반응형
LIST