SVM(Support Vector Machine) 파이썬 코드 공유

SVM은 분류와 회귀 분석에 모두 사용할 수 있는 강력하고 유연한 지도 학습 알고리즘입니다. 이 포스팅에서는 파이썬으로 작성해 본 SVM(Support Vector Machine) 코드를 공유합니다.

#SVM(Support Vector Machine) : 분류와 회귀 분석에 모두 사용할 수 있는 강력하고 유연한 지도 학습 알고리즘 
#임포트 작업
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# seaborn 플로팅의 기본 설정 사용
import seaborn as sns; sns.set()

In [11]:

# Classification demo: generate two well-separated blobs of points.
# Figure: the data to classify; many different separating lines could be drawn.
# make_blobs is imported from the public sklearn.datasets package; the
# sklearn.datasets.samples_generator module path was deprecated and later
# removed from scikit-learn.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn');

In [12]:

# Several different separating lines can be drawn through this data.
# Figure: three perfect linear discriminative classifiers for the same data.
# Which class the point marked 'x' falls into depends on the line chosen.
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plt.plot([0.6], [2.1], 'x', color='red', markeredgewidth=2, markersize=10)

# (slope, intercept) of each candidate separator.
candidate_lines = [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]
for slope, intercept in candidate_lines:
    plt.plot(xfit, slope * xfit + intercept, '-k')

plt.xlim(-1, 3.5);

In [13]:

# Maximizing the margin makes the classification more reliable.
# Margin: the width from a line out to the nearest surrounding point.
# Draw each candidate line together with its margin band.
# The line that maximizes the margin is the optimal model.
# An SVM is the canonical example of a maximum-margin estimator.
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

# (slope, intercept, half-width of the margin band) for each candidate line.
margin_lines = [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]
for slope, intercept, half_width in margin_lines:
    yfit = slope * xfit + intercept
    lower, upper = yfit - half_width, yfit + half_width
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, lower, upper, edgecolor='none',
                     color='#AAAAAA', alpha=0.4)

plt.xlim(-1, 3.5);

In [14]:

# Fit the SVM.
# The C parameter is set to a very large number.
from sklearn.svm import SVC # "Support vector classifier": used to train the SVM model
model = SVC(kernel='linear', C=1E10)
model.fit(X, y)

Out[14]:

SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:

#위 SVM의 분리선을 그릴 함수 만들기 
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function of a fitted two-dimensional SVC.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``decision_function`` and, when ``plot_support`` is
        true, a ``support_vectors_`` array with two columns.
    ax : matplotlib axes, optional
        Axes to draw on; the current axes are used when omitted.
    plot_support : bool, optional
        When true, circle the support vectors.

    The axes limits found on entry are restored before returning.
    """
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Build a 30x30 evaluation grid spanning the current view.
    x_ticks = np.linspace(xlim[0], xlim[1], 30)
    y_ticks = np.linspace(ylim[0], ylim[1], 30)
    grid_y, grid_x = np.meshgrid(y_ticks, x_ticks)
    grid_points = np.vstack([grid_x.ravel(), grid_y.ravel()]).T
    decision = model.decision_function(grid_points).reshape(grid_x.shape)

    # Decision boundary (level 0, solid) and margins (levels -1/+1, dashed).
    ax.contour(grid_x, grid_y, decision, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # Circle the support vectors.  The edge colour is given explicitly:
    # with facecolors='none' and Matplotlib's default edgecolors='face',
    # the hollow markers would otherwise have no visible outline.
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none',
                   edgecolors='black')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

In [16]:

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(model);
# The margin-maximizing separating line is drawn.

In [17]:

# In scikit-learn these points (the support vectors) are stored in the
# support_vectors_ attribute.
#
# The key to this classifier's success is that only the positions of the
# support vectors matter for the fit.  Points that lie far from the margin
# on the correct side do not change the fit: as long as they do not cross
# the margin they do not contribute to the loss function used to fit the
# model, so neither their position nor their number matters.
#
# The explanation is kept in comments rather than a bare string literal, and
# the attribute access comes last, so the cell displays the support-vector
# array instead of the string.
model.support_vectors_

Out[17]:

array([[0.44359863, 3.11530945],
       [2.33812285, 3.43116792],
       [2.06156753, 1.96918596]])

In [18]:

#예를들어, 60개점과 120개의 점 그러나 모델은 바뀌지 않았음 => 마진에서 떨어진 점들이 어디에 위치하든 무감각한것이 SVM의 강점 
def plot_svm(N=10, ax=None):
    """Fit a linear SVC on the first N points of a fixed blob set and plot it.

    Parameters
    ----------
    N : int, optional
        Number of leading samples, out of the 200 generated, used for the fit.
    ax : matplotlib axes, optional
        Axes to draw on; the current axes are used when omitted.
    """
    X, y = make_blobs(n_samples=200, centers=2,
                      random_state=0, cluster_std=0.60)
    X = X[:N]
    y = y[:N]
    model = SVC(kernel='linear', C=1E10)
    model.fit(X, y)

    # Explicit None check (same convention as plot_svc_decision_function)
    # instead of relying on the truthiness of an axes object.
    if ax is None:
        ax = plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    # Fixed limits so plots for different N are directly comparable.
    ax.set_xlim(-1, 4)
    ax.set_ylim(-1, 6)
    plot_svc_decision_function(model, ax)

# Side-by-side panels: the same model fitted on the first 60 and 120 points.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for N, axi in zip([60, 120], ax):
    plot_svm(N, axi)
    axi.set_title('N = {0}'.format(N))

In [19]:

# The same comparison can be explored interactively.
from ipywidgets import interact, fixed
interact(plot_svm, N=[10, 200], ax=fixed(None));

In [20]:

# Why kernel SVMs are needed.
# make_circles is imported from the public sklearn.datasets package; the
# sklearn.datasets.samples_generator module path was deprecated and later
# removed from scikit-learn.
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=.1, noise=.1)

clf = SVC(kernel='linear').fit(X, y)

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf, plot_support=False);
# Figure: data that is not linearly separable.  How should it be split?

In [21]:

# First step: compute a radial basis function of the points, centred on the
# origin (where the middle clump sits), to use as an extra dimension.
r = np.exp(-(X ** 2).sum(1))

In [22]:

#3차원으로 만든 위 데이터분포
#차원을 추가하여 선형 분리 가능하게함 
#이걸 달성할 수 있는 방법이 커널 변환(데이터세트의 모든 점에 중심을 둔 기저 함수를 계산하고 SVM알고리즘이 그 결과 추려내게하는 기저 함수 변환)
#각 쌍의 점들 사이의 유사관계(커널)에 기반을 두고있기 때문.
from mpl_toolkits import mplot3d

def plot_3D(elev=30, azim=30, X=X, y=y):
    """Scatter the 2-D points in 3-D, using a radial feature as the height.

    Parameters
    ----------
    elev, azim : numbers, optional
        Elevation and azimuth passed to ``view_init``.
    X : array with two columns, optional
        Points to plot; defaults to the module-level ``X`` at definition time.
    y : array, optional
        Values used to colour the points.
    """
    # Derive the radial feature from the X being plotted instead of reading a
    # module-level `r`, so the heights always correspond to these points.
    r = np.exp(-(X ** 2).sum(1))
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='autumn')
    ax.view_init(elev=elev, azim=azim)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')

# The keyword must be `azim` to match plot_3D's parameter name; the previous
# spelling `azip` did not correspond to any parameter, so the intended
# azimuth range was never applied.
interact(plot_3D, elev=[-90, 90], azim=(-180, 180),
         X=fixed(X), y=fixed(y));

In [23]:

# Switch from the linear kernel to an RBF (radial basis function) kernel to
# apply a kernelized SVM.
clf = SVC(kernel='rbf', C=1E6)
clf.fit(X, y)

Out[23]:

SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)
# Circle the support vectors.  An explicit edge colour is required: hollow
# markers (facecolors='none') with Matplotlib's default edgecolors='face'
# would have no visible outline.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none', edgecolors='black');
# Figure: a kernel SVM fitted to the data; it finds a suitable nonlinear
# decision boundary.
# This kernel transformation strategy is used in machine learning to turn
# fast linear methods into fast nonlinear ones, and is common in models that
# can use the kernel trick.

In [25]:

# What happens when the data overlaps to some degree?
# Figure: overlapping data.
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=1.2)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn');

In [29]:

# For overlapping data the SVM provides a fudge factor that softens the margin:
# some points are allowed inside the margin if that gives a better fit.
# The hardness of the margin is controlled by the tuning parameter C.
# Large C: the margin is hard and points cannot lie inside it.
# Small C: the margin is softer and may contain some points.
# Too large a C risks overfitting, so C has to be tuned.
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=0.8)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for axi, C in zip(ax, [10.0, 0.1]):
    model = SVC(kernel='linear', C=C).fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(model, axi)
    # Circle the support vectors.  An explicit edge colour is required:
    # hollow markers (facecolors='none') with Matplotlib's default
    # edgecolors='face' would have no visible outline.
    axi.scatter(model.support_vectors_[:, 0],
                model.support_vectors_[:, 1],
                s=300, lw=1, facecolors='none', edgecolors='black')
    axi.set_title('C = {0:.1f}'.format(C), size=14)
# Figure: effect of the C parameter on the support vector fit.

In [36]:

"""
SVM 장점 
1. 비교적 적은 수의 서포트 벡터에 의존하여 간결하고 적은 메모리 사용
2. 모델이 훈련되면 예측 단계가 빠르게 수행됨
3. 마진에 인접한 점에 의해서만 영향 받기 때문에 고차원 데이터에서 잘 작동함
4. 여러 유형의 데이터에 적응시킬 수 있어 다양한 용도로 사용 가능
SVM 단점 
1. 훈련 표본의 개수가 클 경우 좋지 않음
2. 어떤 C값을 선택했느냐에 따라 결과가 달라짐 이 값은 교차 검증을 통해 신중하게 선택 해야하는데 데이터세트 커질수록 비용 커질 수 있음
3. 결과에 대해 직접적으로 확률적 해석 할 수 없음 추정만 가능한데 이 과정이 많은 비용을 발생시킴
"""

Out[36]:

'\nSVM 장점 \n1. 비교적 적은 수의 서포트 벡터에 의존하여 간결하고 적은 메모리 사용\n2. 모델이 훈련되면 예측 단계가 빠르게 수행됨\n3. 마진에 인접한 점에 의해서만 영향 받기 때문에 고차원 데이터에서 잘 작동함\n4. 여러 유형의 데이터에 적응시킬 수 있어 다양한 용도로 사용 가능\nSVM 단점 \n1. 훈련 표본의 개수가 클 경우 좋지 않음\n2. 어떤 C값을 선택했느냐에 따라 결과가 달라짐 이 값은 교차 검증을 통해 신중하게 선택 해야하는데 데이터세트 커질수록 비용 커질 수 있음\n3. 결과에 대해 직접적으로 확률적 해석 할 수 없음 추정만 가능한데 이 과정이 많은 비용을 발생시킴\n'

In [37]:

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets


def make_meshgrid(x, y, h=.02):
    """Build a rectangular grid of points covering the data, padded by 1.

    Parameters
    ----------
    x : array
        Values that determine the extent of the grid along the x-axis.
    y : array
        Values that determine the extent of the grid along the y-axis.
    h : float, optional
        Step between neighbouring grid points.

    Returns
    -------
    xx, yy : ndarray
        Coordinate matrices as produced by ``np.meshgrid``.
    """
    # Each axis runs from (min - 1) up to, but not including, (max + 1).
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)
    return grid_x, grid_y


def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a grid as filled contours.

    Parameters
    ----------
    ax : matplotlib axes object
        Axes whose ``contourf`` method is used for drawing.
    clf : classifier
        Object whose ``predict`` is called on the grid points.
    xx : ndarray
        Meshgrid x-coordinates.
    yy : ndarray
        Meshgrid y-coordinates.
    params : optional keyword arguments
        Forwarded unchanged to ``contourf``.

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # One row per grid point: (x, y).
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predictions = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predictions, **params)


# Load the iris data set.
iris = datasets.load_iris()
# Keep only the first two feature columns (labelled sepal length and sepal
# width on the plots below).
X = iris.data[:, :2]
y = iris.target

# Create one instance of each SVM variant and fit it to the data.  The data
# is deliberately left unscaled.
C = 1.0  # SVM regularization parameter, set to 1
models = (
    svm.SVC(kernel='linear', C=C),
    svm.LinearSVC(C=C),
    svm.SVC(kernel='rbf', gamma=0.7, C=C),
    svm.SVC(kernel='poly', degree=3, C=C),
)
# Generator: each estimator is fitted when the plotting loop reaches it.
models = (clf.fit(X, y) for clf in models)

# Plot titles, in the same order as the models.
titles = (
    'SVC with linear kernel',
    'LinearSVC (linear kernel)',
    'SVC with RBF kernel',
    'SVC with polynomial (degree 3) kernel',
)

# 2x2 grid of axes, one per model.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    # Predicted class regions first, then the training points on top.
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    # Clamp the view to the extent of the grid.
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    # Hide tick marks on both axes.
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

Automatically created module for IPython interactive environment